Убрано лишнее из CMSIS библиотеки

Добавлено подключение DSP в конфиг периферии
This commit is contained in:
Razvalyaev 2025-11-15 08:22:07 +03:00
parent 5a03fbb513
commit d7dec9df35
158 changed files with 106 additions and 36689 deletions

View File

@ -1,563 +0,0 @@
/**************************************************************************//**
* @file cmsis_armcc.h
* @brief CMSIS compiler specific macros, functions, instructions
* @version V1.0.5
* @date 05. May 2021
******************************************************************************/
/*
* Copyright (c) 2009-2021 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __CMSIS_ARMCC_H
#define __CMSIS_ARMCC_H
/* Refuse to build with Arm Compiler releases older than V4.0.677.
   The check is skipped when __ARMCC_VERSION is not defined at all. */
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 400677)
#error "Please use Arm Compiler Toolchain V4.0.677 or later!"
#endif
/* CMSIS compiler control architecture macros:
   define __ARM_ARCH_7A__ when the toolchain reports __TARGET_ARCH_7_A == 1. */
#if (defined (__TARGET_ARCH_7_A ) && (__TARGET_ARCH_7_A == 1))
#define __ARM_ARCH_7A__ 1
#endif
/* CMSIS compiler specific defines (Arm Compiler 4/5 flavour).
   Every macro is wrapped in #ifndef, so a definition made before this point
   takes precedence over the one given here. */

/* Inline-assembler keyword. */
#ifndef __ASM
  #define __ASM __asm
#endif
/* Inlining hints and their static variants. */
#ifndef __INLINE
  #define __INLINE __inline
#endif
#ifndef __FORCEINLINE
  #define __FORCEINLINE __forceinline
#endif
#ifndef __STATIC_INLINE
  #define __STATIC_INLINE static __inline
#endif
#ifndef __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE static __forceinline
#endif
/* Function and object attributes. */
#ifndef __NO_RETURN
  #define __NO_RETURN __declspec(noreturn)
#endif
#ifndef CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __USED
  #define __USED __attribute__((used))
#endif
#ifndef __WEAK
  #define __WEAK __attribute__((weak))
#endif
/* Packing helpers. __PACKED is defined exactly once here; the second, identical
   guard that used to follow __ALIGNED could never take effect and was dropped. */
#ifndef __PACKED
  #define __PACKED __attribute__((packed))
#endif
#ifndef __PACKED_STRUCT
  #define __PACKED_STRUCT __packed struct
#endif
/* Unaligned 16/32-bit accessors built on __packed-qualified pointers.
   addr is cast to a __packed pointer and dereferenced directly. */
#ifndef __UNALIGNED_UINT16_WRITE
  #define __UNALIGNED_UINT16_WRITE(addr, val) ((*((__packed uint16_t *)(addr))) = (val))
#endif
#ifndef __UNALIGNED_UINT16_READ
  #define __UNALIGNED_UINT16_READ(addr) (*((const __packed uint16_t *)(addr)))
#endif
#ifndef __UNALIGNED_UINT32_WRITE
  #define __UNALIGNED_UINT32_WRITE(addr, val) ((*((__packed uint32_t *)(addr))) = (val))
#endif
#ifndef __UNALIGNED_UINT32_READ
  #define __UNALIGNED_UINT32_READ(addr) (*((const __packed uint32_t *)(addr)))
#endif
/* Alignment attribute: x is the requested alignment. */
#ifndef __ALIGNED
  #define __ALIGNED(x) __attribute__((aligned(x)))
#endif
/* Compiler-level barrier, expressed through __memory_changed(). */
#ifndef __COMPILER_BARRIER
  #define __COMPILER_BARRIER() __memory_changed()
#endif
/* ########################## Core Instruction Access ######################### */
/* The macros in this group are plain aliases: each CMSIS name expands to the
   lower-case name supplied by the toolchain. */
/**
\brief No Operation
*/
#define __NOP __nop
/**
\brief Wait For Interrupt
*/
#define __WFI __wfi
/**
\brief Wait For Event
*/
#define __WFE __wfe
/**
\brief Send Event
*/
#define __SEV __sev
/**
\brief Instruction Synchronization Barrier
\details Expands to __isb(0xF).
*/
#define __ISB() __isb(0xF)
/**
\brief Data Synchronization Barrier
\details Expands to __dsb(0xF).
*/
#define __DSB() __dsb(0xF)
/**
\brief Data Memory Barrier
\details Expands to __dmb(0xF).
*/
#define __DMB() __dmb(0xF)
/**
\brief Reverse byte order (32 bit)
\details Reverses the byte order in an unsigned integer value. For example, 0x12345678 becomes 0x78563412.
\param [in] value Value to reverse
\return Reversed value
*/
#define __REV __rev
/**
\brief Reverse byte order (16 bit)
\details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
         Implemented as an embedded-assembler function placed in section ".rev16_text":
         a single rev16 on r0 followed by a return through lr.
         The function does not exist when __NO_EMBEDDED_ASM is defined.
\param [in] value Value to reverse
\return Reversed value
*/
#ifndef __NO_EMBEDDED_ASM
__attribute__((section(".rev16_text"))) __STATIC_INLINE __ASM uint32_t __REV16(uint32_t value)
{
rev16 r0, r0
bx lr
}
#endif
/**
\brief Reverse byte order (16 bit), signed result
\details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
         Implemented as an embedded-assembler function placed in section ".revsh_text":
         a single revsh on r0 followed by a return through lr.
         The function does not exist when __NO_EMBEDDED_ASM is defined.
\param [in] value Value to reverse
\return Reversed value
*/
#ifndef __NO_EMBEDDED_ASM
__attribute__((section(".revsh_text"))) __STATIC_INLINE __ASM int16_t __REVSH(int16_t value)
{
revsh r0, r0
bx lr
}
#endif
/**
\brief Rotate Right in unsigned value (32 bit)
\details Alias for the toolchain-provided __ror.
\param [in] op1 Value to rotate
\param [in] op2 Number of Bits to rotate
\return Rotated value
*/
#define __ROR __ror
/**
\brief Breakpoint
\details Expands to __breakpoint(value).
\param [in] value is ignored by the processor.
If required, a debugger can use it to store additional information about the breakpoint.
*/
#define __BKPT(value) __breakpoint(value)
/**
\brief Reverse bit order of value
\details Alias for the toolchain-provided __rbit.
\param [in] value Value to reverse
\return Reversed value
*/
#define __RBIT __rbit
/**
\brief Count leading zeros
\details Alias for the toolchain-provided __clz.
\param [in] value Value to count the leading zeros
\return number of leading zeros in value
*/
#define __CLZ __clz
/*
 * Exclusive load/store wrappers.
 * Each macro has two variants selected on __ARMCC_VERSION:
 *  - below 5060020: a direct call to __ldrex / __strex;
 *  - otherwise (including when __ARMCC_VERSION is undefined): the same call
 *    bracketed by _Pragma("push") / _Pragma("diag_suppress 3731") / _Pragma("pop").
 * NOTE(review): diagnostic 3731 is presumably the warning these intrinsics raise
 * on newer compilers - confirm against the toolchain documentation.
 */
/**
\brief LDR Exclusive (8 bit)
\details Executes an exclusive LDR instruction for an 8 bit value.
\param [in] ptr Pointer to data
\return value of type uint8_t at (*ptr)
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __LDREXB(ptr) ((uint8_t ) __ldrex(ptr))
#else
#define __LDREXB(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint8_t ) __ldrex(ptr)) _Pragma("pop")
#endif
/**
\brief LDR Exclusive (16 bit)
\details Executes an exclusive LDR instruction for 16 bit values.
\param [in] ptr Pointer to data
\return value of type uint16_t at (*ptr)
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __LDREXH(ptr) ((uint16_t) __ldrex(ptr))
#else
#define __LDREXH(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint16_t) __ldrex(ptr)) _Pragma("pop")
#endif
/**
\brief LDR Exclusive (32 bit)
\details Executes an exclusive LDR instruction for 32 bit values.
\param [in] ptr Pointer to data
\return value of type uint32_t at (*ptr)
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __LDREXW(ptr) ((uint32_t ) __ldrex(ptr))
#else
#define __LDREXW(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint32_t ) __ldrex(ptr)) _Pragma("pop")
#endif
/**
\brief STR Exclusive (8 bit)
\details Executes an exclusive STR instruction for 8 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __STREXB(value, ptr) __strex(value, ptr)
#else
#define __STREXB(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
#endif
/**
\brief STR Exclusive (16 bit)
\details Executes an exclusive STR instruction for 16 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __STREXH(value, ptr) __strex(value, ptr)
#else
#define __STREXH(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
#endif
/**
\brief STR Exclusive (32 bit)
\details Executes an exclusive STR instruction for 32 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
#define __STREXW(value, ptr) __strex(value, ptr)
#else
#define __STREXW(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
#endif
/**
\brief Remove the exclusive lock
\details Removes the exclusive lock which is created by LDREX. Alias for __clrex.
*/
#define __CLREX __clrex
/**
\brief Signed Saturate
\details Saturates a signed value. Alias for __ssat.
\param [in] value Value to be saturated
\param [in] sat Bit position to saturate to (1..32)
\return Saturated value
*/
#define __SSAT __ssat
/**
\brief Unsigned Saturate
\details Saturates an unsigned value. Alias for __usat.
\param [in] value Value to be saturated
\param [in] sat Bit position to saturate to (0..31)
\return Saturated value
*/
#define __USAT __usat
/* ########################### Core Function Access ########################### */
/**
\brief Enable IRQ Interrupts
\details Enables IRQ interrupts by clearing the I-bit in the CPSR.
Can only be executed in Privileged modes.
No definition is given here: the compiler supplies it, as the note below records.
*/
/* intrinsic void __enable_irq(); */
/**
\brief Disable IRQ Interrupts
\details Disables IRQ interrupts by setting the I-bit in the CPSR.
Can only be executed in Privileged modes.
No definition is given here: the compiler supplies it, as the note below records.
*/
/* intrinsic void __disable_irq(void); */
/**
\brief Enable FIQ
\details Enables FIQ interrupts by clearing the F-bit in the CPSR.
Can only be executed in Privileged modes.
The CMSIS name __enable_fault_irq is an alias for __enable_fiq.
*/
#define __enable_fault_irq __enable_fiq
/**
\brief Disable FIQ
\details Disables FIQ interrupts by setting the F-bit in the CPSR.
Can only be executed in Privileged modes.
The CMSIS name __disable_fault_irq is an alias for __disable_fiq.
*/
#define __disable_fault_irq __disable_fiq
/**
\brief Get FPSCR (Floating Point Status/Control)
\details Reads the register through a named register variable bound to "fpscr".
         When __FPU_PRESENT and __FPU_USED are not both defined as 1U the
         function compiles to a constant 0U instead.
\return Floating Point Status/Control register value, or 0U without an FPU in use
*/
__STATIC_INLINE uint32_t __get_FPSCR(void)
{
#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
(defined (__FPU_USED ) && (__FPU_USED == 1U)) )
register uint32_t __regfpscr __ASM("fpscr");
return(__regfpscr);
#else
return(0U);
#endif
}
/**
\brief Set FPSCR (Floating Point Status/Control)
\details Writes the register through a named register variable bound to "fpscr".
         When __FPU_PRESENT and __FPU_USED are not both defined as 1U the
         argument is discarded and nothing is written.
\param [in] fpscr Floating Point Status/Control value to set
*/
__STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
{
#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
(defined (__FPU_USED ) && (__FPU_USED == 1U)) )
register uint32_t __regfpscr __ASM("fpscr");
__regfpscr = (fpscr);
#else
/* The cast only silences the unused-parameter diagnostic. */
(void)fpscr;
#endif
}
/** \brief Get CPSR (Current Program Status Register)
\details Reads the register through a named register variable bound to "cpsr".
\return CPSR Register value
*/
__STATIC_INLINE uint32_t __get_CPSR(void)
{
register uint32_t __regCPSR __ASM("cpsr");
return(__regCPSR);
}
/** \brief Set CPSR (Current Program Status Register)
\details Writes the register through a named register variable bound to "cpsr".
\param [in] cpsr CPSR value to set
*/
__STATIC_INLINE void __set_CPSR(uint32_t cpsr)
{
register uint32_t __regCPSR __ASM("cpsr");
__regCPSR = cpsr;
}
/** \brief Get Mode
\details Extracts the mode field, i.e. the five least significant bits
         (mask 0x1F), from the value returned by __get_CPSR().
\return Processor Mode
*/
__STATIC_INLINE uint32_t __get_mode(void)
{
  const uint32_t status = __get_CPSR();
  return (status & 0x1FU);
}
/** \brief Set Mode
\details Embedded-assembler routine: copies lr to r1, writes r0 to the CPSR
         control field, then returns through r1 rather than lr.
         NOTE(review): lr is presumably saved first because the mode switch
         changes the banked lr - confirm.
\param [in] mode Mode value to set
*/
__STATIC_INLINE __ASM void __set_mode(uint32_t mode)
{
MOV r1, lr
MSR CPSR_C, r0
BX r1
}
/** \brief Get Stack Pointer
\details Embedded-assembler routine: moves sp into r0 and returns.
\return Stack Pointer
*/
__STATIC_INLINE __ASM uint32_t __get_SP(void)
{
MOV r0, sp
BX lr
}
/** \brief Set Stack Pointer
\details Embedded-assembler routine: moves r0 into sp and returns.
\param [in] stack Stack Pointer value to set
*/
__STATIC_INLINE __ASM void __set_SP(uint32_t stack)
{
MOV sp, r0
BX lr
}
/** \brief Get USR/SYS Stack Pointer
\details Embedded-assembler routine. Saves the CPSR in R1, issues CPS #0x1F,
         copies SP into R0, restores the saved CPSR control field, executes an
         ISB and returns. Per the inline notes, the CPS and MSR steps have no
         effect in USR mode.
\return USR/SYS Stack Pointer
*/
__STATIC_INLINE __ASM uint32_t __get_SP_usr(void)
{
ARM
PRESERVE8
MRS R1, CPSR
CPS #0x1F ;no effect in USR mode
MOV R0, SP
MSR CPSR_c, R1 ;no effect in USR mode
ISB
BX LR
}
/** \brief Set USR/SYS Stack Pointer
\details Embedded-assembler routine. Saves the CPSR in R1, issues CPS #0x1F,
         loads SP from R0, restores the saved CPSR control field, executes an
         ISB and returns. Per the inline notes, the CPS and MSR steps have no
         effect in USR mode.
\param [in] topOfProcStack USR/SYS Stack Pointer value to set
*/
__STATIC_INLINE __ASM void __set_SP_usr(uint32_t topOfProcStack)
{
ARM
PRESERVE8
MRS R1, CPSR
CPS #0x1F ;no effect in USR mode
MOV SP, R0
MSR CPSR_c, R1 ;no effect in USR mode
ISB
BX LR
}
/** \brief Get FPEXC (Floating Point Exception Control Register)
\details Reads the register through a named register variable bound to "fpexc".
         The FPU guard tests that __FPU_PRESENT is defined before comparing it,
         matching the guard used by __get_FPSCR, so builds that never define
         the macro do not rely on the implicit "undefined identifier is 0" rule.
\return Floating Point Exception Control Register value, or 0U when no FPU is present
*/
__STATIC_INLINE uint32_t __get_FPEXC(void)
{
#if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  register uint32_t __regfpexc __ASM("fpexc");
  return(__regfpexc);
#else
  return(0U);
#endif
}
/** \brief Set FPEXC (Floating Point Exception Control Register)
\details Writes the register through a named register variable bound to "fpexc".
         The FPU guard tests that __FPU_PRESENT is defined before comparing it,
         matching the guard used by __set_FPSCR. Without an FPU the argument is
         explicitly discarded so the parameter is not reported as unused.
\param [in] fpexc Floating Point Exception Control value to set
*/
__STATIC_INLINE void __set_FPEXC(uint32_t fpexc)
{
#if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  register uint32_t __regfpexc __ASM("fpexc");
  __regfpexc = (fpexc);
#else
  (void)fpexc;
#endif
}
/*
 * Include common core functions to access Coprocessor 15 registers
 *
 * __get_CP / __set_CP   : bind a named register variable to the string
 *                         "cp<cp>:<op1>:c<CRn>:c<CRm>:<op2>" (built by stringizing
 *                         the arguments) and copy it to / from Rt.
 * __get_CP64 / __set_CP64: emit an MRRC / MCRR instruction whose text is built
 *                         the same way; the 64-bit value is split into the
 *                         locals ltmp (low 32 bits) and htmp (high 32 bits).
 * All four are wrapped in do { ... } while(0) so they act as one statement.
 * Rt must be an lvalue for the getters.
 */
#define __get_CP(cp, op1, Rt, CRn, CRm, op2) do { register volatile uint32_t tmp __ASM("cp" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2); (Rt) = tmp; } while(0)
#define __set_CP(cp, op1, Rt, CRn, CRm, op2) do { register volatile uint32_t tmp __ASM("cp" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2); tmp = (Rt); } while(0)
#define __get_CP64(cp, op1, Rt, CRm) \
do { \
uint32_t ltmp, htmp; \
__ASM volatile("MRRC p" # cp ", " # op1 ", ltmp, htmp, c" # CRm); \
(Rt) = ((((uint64_t)htmp) << 32U) | ((uint64_t)ltmp)); \
} while(0)
#define __set_CP64(cp, op1, Rt, CRm) \
do { \
const uint64_t tmp = (Rt); \
const uint32_t ltmp = (uint32_t)(tmp); \
const uint32_t htmp = (uint32_t)(tmp >> 32U); \
__ASM volatile("MCRR p" # cp ", " # op1 ", ltmp, htmp, c" # CRm); \
} while(0)
#include "cmsis_cp15.h"
/** \brief Enable Floating Point Unit
Critical section, called from undef handler, so systick is disabled

Embedded-assembler routine performing, in order:
 1. set bits 0x00F00000 in the CP15 register read/written with c1,c0,2
    (named CPACR in the inline note);
 2. ISB;
 3. set bit 0x40000000 in FPEXC;
 4. zero D0-D15, and D16-D31 as well when the assembler reports
    TARGET_FEATURE_EXTENSION_REGISTER_COUNT == 32;
 5. AND the FPSCR with 0x00086060 so only those bits keep their value.
Only R1 and R2 are used as scratch registers.
*/
__STATIC_INLINE __ASM void __FPU_Enable(void)
{
ARM
//Permit access to VFP/NEON, registers by modifying CPACR
MRC p15,0,R1,c1,c0,2
ORR R1,R1,#0x00F00000
MCR p15,0,R1,c1,c0,2
//Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
ISB
//Enable VFP/NEON
VMRS R1,FPEXC
ORR R1,R1,#0x40000000
VMSR FPEXC,R1
//Initialise VFP/NEON registers to 0
MOV R2,#0
//Initialise D16 registers to 0
VMOV D0, R2,R2
VMOV D1, R2,R2
VMOV D2, R2,R2
VMOV D3, R2,R2
VMOV D4, R2,R2
VMOV D5, R2,R2
VMOV D6, R2,R2
VMOV D7, R2,R2
VMOV D8, R2,R2
VMOV D9, R2,R2
VMOV D10,R2,R2
VMOV D11,R2,R2
VMOV D12,R2,R2
VMOV D13,R2,R2
VMOV D14,R2,R2
VMOV D15,R2,R2
IF {TARGET_FEATURE_EXTENSION_REGISTER_COUNT} == 32
//Initialise D32 registers to 0
VMOV D16,R2,R2
VMOV D17,R2,R2
VMOV D18,R2,R2
VMOV D19,R2,R2
VMOV D20,R2,R2
VMOV D21,R2,R2
VMOV D22,R2,R2
VMOV D23,R2,R2
VMOV D24,R2,R2
VMOV D25,R2,R2
VMOV D26,R2,R2
VMOV D27,R2,R2
VMOV D28,R2,R2
VMOV D29,R2,R2
VMOV D30,R2,R2
VMOV D31,R2,R2
ENDIF
//Initialise FPSCR to a known state
VMRS R1,FPSCR
LDR R2,=0x00086060 //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
AND R1,R1,R2
VMSR FPSCR,R1
BX LR
}
#endif /* __CMSIS_ARMCC_H */

View File

@ -1,614 +0,0 @@
/**************************************************************************//**
* @file cmsis_armclang.h
* @brief CMSIS compiler specific macros, functions, instructions
* @version V1.2.1
* @date 05. May 2021
******************************************************************************/
/*
* Copyright (c) 2009-2021 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __CMSIS_ARMCLANG_H
#define __CMSIS_ARMCLANG_H
#pragma clang system_header /* treat file as system include file */
/* CMSIS compiler specific defines (Arm Compiler 6 / armclang flavour).
   Every macro is wrapped in #ifndef, so a definition made before this point
   takes precedence over the one given here. */

/* Inline-assembler keyword. */
#ifndef __ASM
  #define __ASM __asm
#endif
/* Inlining hints and their static variants. */
#ifndef __INLINE
  #define __INLINE __inline
#endif
#ifndef __FORCEINLINE
  #define __FORCEINLINE __attribute__((always_inline))
#endif
#ifndef __STATIC_INLINE
  #define __STATIC_INLINE static __inline
#endif
#ifndef __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
#endif
/* Function and object attributes. */
#ifndef __NO_RETURN
  #define __NO_RETURN __attribute__((__noreturn__))
#endif
#ifndef CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __USED
  #define __USED __attribute__((used))
#endif
#ifndef __WEAK
  #define __WEAK __attribute__((weak))
#endif
/* Packing helpers. __PACKED is defined exactly once here; a later second guard
   carrying a different expansion (packed without aligned(1)) could never take
   effect and was dropped. */
#ifndef __PACKED
  #define __PACKED __attribute__((packed, aligned(1)))
#endif
#ifndef __PACKED_STRUCT
  #define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
#endif
/* Unaligned 16/32-bit accessors. Each one declares a one-member packed struct
   and accesses the member through a pointer cast of addr; -Wpacked is silenced
   around the struct declaration only. */
#ifndef __UNALIGNED_UINT16_WRITE
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
  __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT16_READ
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
  __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
#endif
#ifndef __UNALIGNED_UINT32_WRITE
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
  __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
#endif
#ifndef __UNALIGNED_UINT32_READ
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
  __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
  #pragma clang diagnostic pop
  #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
#endif
/* Alignment attribute: x is the requested alignment. */
#ifndef __ALIGNED
  #define __ALIGNED(x) __attribute__((aligned(x)))
#endif
/* Compiler-level barrier: an empty asm statement with a "memory" clobber. */
#ifndef __COMPILER_BARRIER
  #define __COMPILER_BARRIER() __ASM volatile("":::"memory")
#endif
/* ########################## Core Instruction Access ######################### */
/* Most macros in this group are plain aliases for a compiler builtin. */
/**
\brief No Operation
*/
#define __NOP __builtin_arm_nop
/**
\brief Wait For Interrupt
*/
#define __WFI __builtin_arm_wfi
/**
\brief Wait For Event
*/
#define __WFE __builtin_arm_wfe
/**
\brief Send Event
*/
#define __SEV __builtin_arm_sev
/**
\brief Instruction Synchronization Barrier
\details Expands to __builtin_arm_isb(0xF).
*/
#define __ISB() __builtin_arm_isb(0xF)
/**
\brief Data Synchronization Barrier
\details Expands to __builtin_arm_dsb(0xF).
*/
#define __DSB() __builtin_arm_dsb(0xF)
/**
\brief Data Memory Barrier
\details Expands to __builtin_arm_dmb(0xF).
*/
#define __DMB() __builtin_arm_dmb(0xF)
/**
\brief Reverse byte order (32 bit)
\details Reverses the byte order in an unsigned integer value. For example, 0x12345678 becomes 0x78563412.
\param [in] value Value to reverse
\return Reversed value
*/
#define __REV(value) __builtin_bswap32(value)
/**
\brief Reverse byte order (16 bit)
\details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
         Composed as a full 32-bit byte swap (__REV) followed by a rotate right by 16 (__ROR).
         __ROR is declared further down; that is fine because the macro is only
         expanded at its point of use.
\param [in] value Value to reverse
\return Reversed value
*/
#define __REV16(value) __ROR(__REV(value), 16)
/**
\brief Reverse byte order (16 bit), signed result
\details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
         The expansion is wrapped in parentheses so it behaves as a single
         primary expression wherever it is used (e.g. as the operand of sizeof
         or next to postfix operators).
\param [in] value Value to reverse
\return Reversed value
*/
#define __REVSH(value) ((int16_t)__builtin_bswap16(value))
/**
\brief Rotate Right in unsigned value (32 bit)
\details Rotates op1 right by op2 bit positions. The count is first reduced
         modulo 32; a reduced count of zero returns op1 untouched, which also
         keeps the complementary left shift strictly below the operand width.
\param [in] op1 Value to rotate
\param [in] op2 Number of Bits to rotate
\return Rotated value
*/
__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
{
  const uint32_t shift = op2 % 32U;

  return (shift != 0U) ? ((op1 >> shift) | (op1 << (32U - shift)))
                       : op1;
}
/**
\brief Breakpoint
\details Emits a "bkpt" instruction whose operand is the stringized macro
         argument, so value must be a token sequence the assembler accepts
         as an immediate.
\param [in] value is ignored by the processor.
If required, a debugger can use it to store additional information about the breakpoint.
*/
#define __BKPT(value) __ASM volatile ("bkpt "#value)
/**
\brief Reverse bit order of value
\details Alias for __builtin_arm_rbit.
\param [in] value Value to reverse
\return Reversed value
*/
#define __RBIT __builtin_arm_rbit
/**
\brief Count leading zeros
\param [in] value Value to count the leading zeros
\return number of leading zeros in value; 32 when value is zero
*/
__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
{
  /* __builtin_clz(0) is formally undefined even though it maps to a CLZ
     instruction on ARM, so zero is answered without calling the builtin.
     That keeps results ARM-compatible when this happens to be compiled for a
     non-ARM target and stops the compiler from reasoning "the value reached
     __builtin_clz, hence it is non-zero".
     ARM Compiler 6.10 and possibly earlier will optimise this test away,
     leaving a single CLZ instruction.
   */
  uint8_t zeros = 32U;

  if (value != 0U)
  {
    zeros = (uint8_t)__builtin_clz(value);
  }
  return zeros;
}
/*
 * Exclusive access and saturation wrappers.
 * The LDREX/STREX macros are object-like: each expands to a cast followed by
 * the builtin's name, so the cast is applied to the result of the call that
 * the user writes after the macro name, e.g. __LDREXB(ptr).
 */
/**
\brief LDR Exclusive (8 bit)
\details Executes an exclusive LDR instruction for an 8 bit value.
\param [in] ptr Pointer to data
\return value of type uint8_t at (*ptr)
*/
#define __LDREXB (uint8_t)__builtin_arm_ldrex
/**
\brief LDR Exclusive (16 bit)
\details Executes an exclusive LDR instruction for 16 bit values.
\param [in] ptr Pointer to data
\return value of type uint16_t at (*ptr)
*/
#define __LDREXH (uint16_t)__builtin_arm_ldrex
/**
\brief LDR Exclusive (32 bit)
\details Executes an exclusive LDR instruction for 32 bit values.
\param [in] ptr Pointer to data
\return value of type uint32_t at (*ptr)
*/
#define __LDREXW (uint32_t)__builtin_arm_ldrex
/**
\brief STR Exclusive (8 bit)
\details Executes an exclusive STR instruction for 8 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#define __STREXB (uint32_t)__builtin_arm_strex
/**
\brief STR Exclusive (16 bit)
\details Executes an exclusive STR instruction for 16 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#define __STREXH (uint32_t)__builtin_arm_strex
/**
\brief STR Exclusive (32 bit)
\details Executes an exclusive STR instruction for 32 bit values.
\param [in] value Value to store
\param [in] ptr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
#define __STREXW (uint32_t)__builtin_arm_strex
/**
\brief Remove the exclusive lock
\details Removes the exclusive lock which is created by LDREX.
*/
#define __CLREX __builtin_arm_clrex
/**
\brief Signed Saturate
\details Saturates a signed value.
\param [in] value Value to be saturated
\param [in] sat Bit position to saturate to (1..32)
\return Saturated value
*/
#define __SSAT __builtin_arm_ssat
/**
\brief Unsigned Saturate
\details Saturates an unsigned value.
\param [in] value Value to be saturated
\param [in] sat Bit position to saturate to (0..31)
\return Saturated value
*/
#define __USAT __builtin_arm_usat
/* ################### Compiler specific Intrinsics ########################### */
/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
Access to dedicated SIMD instructions
@{
*/
/* The whole group is only available when the compiler reports
   __ARM_FEATURE_DSP == 1. */
#if (defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
/* Direct aliases for compiler builtins. Each name is defined exactly once
   (__SXTB16 used to be listed twice with the same expansion). */
#define __SADD8 __builtin_arm_sadd8
#define __SADD16 __builtin_arm_sadd16
#define __QADD8 __builtin_arm_qadd8
#define __QSUB8 __builtin_arm_qsub8
#define __QADD16 __builtin_arm_qadd16
#define __SHADD16 __builtin_arm_shadd16
#define __QSUB16 __builtin_arm_qsub16
#define __SHSUB16 __builtin_arm_shsub16
#define __QASX __builtin_arm_qasx
#define __SHASX __builtin_arm_shasx
#define __QSAX __builtin_arm_qsax
#define __SHSAX __builtin_arm_shsax
#define __SXTB16 __builtin_arm_sxtb16
#define __SMUAD __builtin_arm_smuad
#define __SMUADX __builtin_arm_smuadx
#define __SMLAD __builtin_arm_smlad
#define __SMLADX __builtin_arm_smladx
#define __SMLALD __builtin_arm_smlald
#define __SMLALDX __builtin_arm_smlaldx
#define __SMUSD __builtin_arm_smusd
#define __SMUSDX __builtin_arm_smusdx
#define __SMLSDX __builtin_arm_smlsdx
#define __USAT16 __builtin_arm_usat16
#define __SSUB8 __builtin_arm_ssub8
#define __SXTAB16 __builtin_arm_sxtab16
/**
\brief Emit a "qadd" instruction on two 32-bit operands.
\param [in] op1 First operand
\param [in] op2 Second operand
\return Value produced by the instruction
*/
__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2)
{
  int32_t result;
  __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
  return(result);
}
/**
\brief Emit a "qsub" instruction on two 32-bit operands.
\param [in] op1 First operand
\param [in] op2 Second operand
\return Value produced by the instruction
*/
__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2)
{
  int32_t result;
  __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
  return(result);
}
/* Pack halfwords in plain C:
   __PKHBT keeps the low 16 bits of ARG1 and the high 16 bits of (ARG2 << ARG3);
   __PKHTB keeps the high 16 bits of ARG1 and the low 16 bits of (ARG2 >> ARG3).
   Both shifts operate on uint32_t, so the right shift is a logical one. */
#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \
                                  ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) )
#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \
                                  ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) )
/**
\brief Emit an "smmla" instruction on three 32-bit operands.
\param [in] op1 First operand
\param [in] op2 Second operand
\param [in] op3 Third operand
\return Value produced by the instruction
*/
__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
{
  int32_t result;
  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) );
  return(result);
}
#endif /* (__ARM_FEATURE_DSP == 1) */
/** @} end of group CMSIS_SIMD_intrinsics */
/* ########################### Core Function Access ########################### */
/* Each of the four functions below is a single "cpsie"/"cpsid" instruction
   with a "memory" clobber, so the compiler will not move memory accesses
   across the point where the interrupt mask changes. */
/**
\brief Enable IRQ Interrupts
\details Enables IRQ interrupts by clearing the I-bit in the CPSR ("cpsie i").
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __enable_irq(void)
{
__ASM volatile ("cpsie i" : : : "memory");
}
/**
\brief Disable IRQ Interrupts
\details Disables IRQ interrupts by setting the I-bit in the CPSR ("cpsid i").
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __disable_irq(void)
{
__ASM volatile ("cpsid i" : : : "memory");
}
/**
\brief Enable FIQ
\details Enables FIQ interrupts by clearing the F-bit in the CPSR ("cpsie f").
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __enable_fault_irq(void)
{
__ASM volatile ("cpsie f" : : : "memory");
}
/**
\brief Disable FIQ
\details Disables FIQ interrupts by setting the F-bit in the CPSR ("cpsid f").
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __disable_fault_irq(void)
{
__ASM volatile ("cpsid f" : : : "memory");
}
/**
\brief Get FPSCR
\details Returns the current value of the Floating Point Status/Control register.
         Alias for __builtin_arm_get_fpscr; unlike the armcc flavour there is
         no __FPU_PRESENT / __FPU_USED fallback here.
\return Floating Point Status/Control register value
*/
#define __get_FPSCR __builtin_arm_get_fpscr
/**
\brief Set FPSCR
\details Assigns the given value to the Floating Point Status/Control register.
         Alias for __builtin_arm_set_fpscr.
\param [in] fpscr Floating Point Status/Control value to set
*/
#define __set_FPSCR __builtin_arm_set_fpscr
/** \brief Get CPSR Register
\details Reads the register with a single "MRS" instruction.
\return CPSR Register value
*/
__STATIC_FORCEINLINE uint32_t __get_CPSR(void)
{
uint32_t result;
__ASM volatile("MRS %0, cpsr" : "=r" (result) );
return(result);
}
/** \brief Set CPSR Register
\details Writes the register with a single "MSR" instruction. The statement
         declares the condition flags ("cc") and memory as clobbered.
\param [in] cpsr CPSR value to set
*/
__STATIC_FORCEINLINE void __set_CPSR(uint32_t cpsr)
{
__ASM volatile ("MSR cpsr, %0" : : "r" (cpsr) : "cc", "memory");
}
/** \brief Get Mode
\details Masks the value returned by __get_CPSR() down to its five least
         significant bits, which hold the mode field.
\return Processor Mode
*/
__STATIC_FORCEINLINE uint32_t __get_mode(void)
{
  const uint32_t mode_mask = 0x1FU;

  return (__get_CPSR() & mode_mask);
}
/** \brief Set Mode
\details Writes mode to the CPSR control field ("MSR cpsr_c") with a
         "memory" clobber.
\param [in] mode Mode value to set
*/
__STATIC_FORCEINLINE void __set_mode(uint32_t mode)
{
__ASM volatile("MSR cpsr_c, %0" : : "r" (mode) : "memory");
}
/** \brief Get Stack Pointer
\details Copies sp into a general-purpose register with a single "MOV".
\return Stack Pointer value
*/
__STATIC_FORCEINLINE uint32_t __get_SP(void)
{
uint32_t result;
__ASM volatile("MOV %0, sp" : "=r" (result) : : "memory");
return result;
}
/** \brief Set Stack Pointer
\details Loads sp from the given value with a single "MOV".
\param [in] stack Stack Pointer value to set
*/
__STATIC_FORCEINLINE void __set_SP(uint32_t stack)
{
__ASM volatile("MOV sp, %0" : : "r" (stack) : "memory");
}
/** \brief Get USR/SYS Stack Pointer
\details One asm statement that saves the CPSR, issues CPS #0x1F, copies sp
         into the result, restores the saved CPSR control field and ends with
         an ISB. Per the inline notes, the CPS and MSR steps have no effect in
         USR mode. Both operands are outputs, so they occupy separate registers.
\return USR/SYS Stack Pointer value
*/
__STATIC_FORCEINLINE uint32_t __get_SP_usr(void)
{
uint32_t cpsr;
uint32_t result;
__ASM volatile(
"MRS %0, cpsr \n"
"CPS #0x1F \n" // no effect in USR mode
"MOV %1, sp \n"
"MSR cpsr_c, %0 \n" // no effect in USR mode
"ISB" : "=r"(cpsr), "=r"(result) : : "memory"
);
return result;
}
/** \brief Set USR/SYS Stack Pointer
\details One asm statement that saves the CPSR, issues CPS #0x1F, loads sp
         from topOfProcStack, restores the saved CPSR control field and ends
         with an ISB. Per the inline notes, the CPS and MSR steps have no
         effect in USR mode.

         The cpsr output is written by the first instruction, before the input
         operand is read by the third one. It is therefore marked early-clobber
         ("=&r") so the compiler may not place it in the same register as
         topOfProcStack.
\param [in] topOfProcStack USR/SYS Stack Pointer value to set
*/
__STATIC_FORCEINLINE void __set_SP_usr(uint32_t topOfProcStack)
{
  uint32_t cpsr;
  __ASM volatile(
    "MRS %0, cpsr \n"
    "CPS #0x1F \n" // no effect in USR mode
    "MOV sp, %1 \n"
    "MSR cpsr_c, %0 \n" // no effect in USR mode
    "ISB" : "=&r"(cpsr) : "r" (topOfProcStack) : "memory"
  );
}
/** \brief Get FPEXC
\details Reads the register with a single "VMRS" instruction. The FPU guard
         tests that __FPU_PRESENT is defined before comparing it, so builds
         that never define the macro do not rely on the implicit
         "undefined identifier is 0" rule (and stay quiet under -Wundef).
\return Floating Point Exception Control register value, or 0U when no FPU is present
*/
__STATIC_FORCEINLINE uint32_t __get_FPEXC(void)
{
#if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  uint32_t result;
  __ASM volatile("VMRS %0, fpexc" : "=r" (result) : : "memory");
  return(result);
#else
  return(0U);
#endif
}
/** \brief Set FPEXC
\details Writes the register with a single "VMSR" instruction. The FPU guard
         tests that __FPU_PRESENT is defined before comparing it. Without an
         FPU the argument is explicitly discarded so the parameter is not
         reported as unused.
\param [in] fpexc Floating Point Exception Control value to set
*/
__STATIC_FORCEINLINE void __set_FPEXC(uint32_t fpexc)
{
#if (defined (__FPU_PRESENT) && (__FPU_PRESENT == 1))
  __ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
#else
  (void)fpexc;
#endif
}
/*
 * Include common core functions to access Coprocessor 15 registers
 *
 * Each macro builds one instruction string by stringizing its arguments:
 *   __get_CP   -> MRC,  Rt is the output operand (must be an lvalue)
 *   __set_CP   -> MCR,  Rt is the input operand
 *   __get_CP64 -> MRRC, Rt is a 64-bit output; %Q0 / %R0 select its two halves
 *   __set_CP64 -> MCRR, Rt is a 64-bit input;  %Q0 / %R0 select its two halves
 * All four carry a "memory" clobber. They expand to a bare asm statement
 * without a trailing semicolon.
 */
#define __get_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
#define __set_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
#define __get_CP64(cp, op1, Rt, CRm) __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : "=r" (Rt) : : "memory" )
#define __set_CP64(cp, op1, Rt, CRm) __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : : "r" (Rt) : "memory" )
#include "cmsis_cp15.h"
/** \brief Enable Floating Point Unit
Critical section, called from undef handler, so systick is disabled

One asm statement performing, in order:
 1. set bits 0x00F00000 in the CP15 register read/written with c1,c0,2
    (named CPACR in the inline note);
 2. ISB;
 3. set bit 0x40000000 in FPEXC;
 4. zero D0-D15, and D16-D31 as well when __ARM_NEON is defined as 1;
 5. AND the FPSCR with 0x00086060 so only those bits keep their value.
R1 and R2 are hard-coded scratch registers and are listed as clobbers together
with the condition flags.
NOTE(review): the D registers, FPEXC and FPSCR modified here are not in the
clobber list and there is no "memory" clobber - confirm this is acceptable for
every call site.
*/
__STATIC_INLINE void __FPU_Enable(void)
{
__ASM volatile(
//Permit access to VFP/NEON, registers by modifying CPACR
" MRC p15,0,R1,c1,c0,2 \n"
" ORR R1,R1,#0x00F00000 \n"
" MCR p15,0,R1,c1,c0,2 \n"
//Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
" ISB \n"
//Enable VFP/NEON
" VMRS R1,FPEXC \n"
" ORR R1,R1,#0x40000000 \n"
" VMSR FPEXC,R1 \n"
//Initialise VFP/NEON registers to 0
" MOV R2,#0 \n"
//Initialise D16 registers to 0
" VMOV D0, R2,R2 \n"
" VMOV D1, R2,R2 \n"
" VMOV D2, R2,R2 \n"
" VMOV D3, R2,R2 \n"
" VMOV D4, R2,R2 \n"
" VMOV D5, R2,R2 \n"
" VMOV D6, R2,R2 \n"
" VMOV D7, R2,R2 \n"
" VMOV D8, R2,R2 \n"
" VMOV D9, R2,R2 \n"
" VMOV D10,R2,R2 \n"
" VMOV D11,R2,R2 \n"
" VMOV D12,R2,R2 \n"
" VMOV D13,R2,R2 \n"
" VMOV D14,R2,R2 \n"
" VMOV D15,R2,R2 \n"
#if (defined(__ARM_NEON) && (__ARM_NEON == 1))
//Initialise D32 registers to 0
" VMOV D16,R2,R2 \n"
" VMOV D17,R2,R2 \n"
" VMOV D18,R2,R2 \n"
" VMOV D19,R2,R2 \n"
" VMOV D20,R2,R2 \n"
" VMOV D21,R2,R2 \n"
" VMOV D22,R2,R2 \n"
" VMOV D23,R2,R2 \n"
" VMOV D24,R2,R2 \n"
" VMOV D25,R2,R2 \n"
" VMOV D26,R2,R2 \n"
" VMOV D27,R2,R2 \n"
" VMOV D28,R2,R2 \n"
" VMOV D29,R2,R2 \n"
" VMOV D30,R2,R2 \n"
" VMOV D31,R2,R2 \n"
#endif
//Initialise FPSCR to a known state
" VMRS R1,FPSCR \n"
" LDR R2,=0x00086060 \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
" AND R1,R1,R2 \n"
" VMSR FPSCR,R1 "
: : : "cc", "r1", "r2"
);
}
#endif /* __CMSIS_ARMCLANG_H */

View File

@ -1,213 +0,0 @@
/**************************************************************************//**
* @file cmsis_compiler.h
* @brief CMSIS compiler specific macros, functions, instructions
* @version V1.0.2
* @date 10. January 2018
******************************************************************************/
/*
* Copyright (c) 2009-2018 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __CMSIS_COMPILER_H
#define __CMSIS_COMPILER_H
#include <stdint.h>
/*
* Arm Compiler 4/5
*/
#if defined ( __CC_ARM )
#include "cmsis_armcc.h"
/*
* Arm Compiler 6 (armclang)
*/
#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#include "cmsis_armclang.h"
/*
* GNU Compiler
*/
#elif defined ( __GNUC__ )
#include "cmsis_gcc.h"
/*
* IAR Compiler
*/
#elif defined ( __ICCARM__ )
#include "cmsis_iccarm.h"
/*
* TI Arm Compiler
*/
#elif defined ( __TI_ARM__ )
#include <cmsis_ccs.h>
/* Fallback definitions for the TI Arm Compiler.
   Every macro is wrapped in #ifndef, so anything already provided
   (e.g. by <cmsis_ccs.h>) takes precedence. */
#ifndef __ASM
  #define __ASM __asm
#endif
#ifndef __INLINE
  #define __INLINE inline
#endif
/* Defined once; the guard used to be repeated verbatim, which had no effect. */
#ifndef __STATIC_INLINE
  #define __STATIC_INLINE static inline
#endif
/* No separate force-inline form here: it falls back to __STATIC_INLINE. */
#ifndef __STATIC_FORCEINLINE
  #define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
#ifndef __NO_RETURN
  #define __NO_RETURN __attribute__((noreturn))
#endif
#ifndef CMSIS_DEPRECATED
  #define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __USED
  #define __USED __attribute__((used))
#endif
#ifndef __WEAK
  #define __WEAK __attribute__((weak))
#endif
/* Unaligned 32-bit access through a one-member packed struct; the expansion
   is an lvalue, so it serves for both reads and writes. */
#ifndef __UNALIGNED_UINT32
  struct __attribute__((packed)) T_UINT32 { uint32_t v; };
  #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
#ifndef __ALIGNED
  #define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __PACKED
  #define __PACKED __attribute__((packed))
#endif
/* No barrier is available for this toolchain: the build emits a warning and
   the macro expands to a no-op expression. */
#ifndef __COMPILER_BARRIER
  #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
  #define __COMPILER_BARRIER() (void)0
#endif
/*
* TASKING Compiler
*/
#elif defined ( __TASKING__ )
/*
* The CMSIS functions have been implemented as intrinsics in the compiler.
* Please use "carm -?i" to get an up to date list of all intrinsics,
* Including the CMSIS ones.
*/
/* Fallback definitions of the CMSIS compiler-abstraction macros for the
   TASKING compiler. Each macro is only defined when not already present. */
/* Inline-assembly keyword. */
#ifndef __ASM
#define __ASM __asm
#endif
/* Inline specifiers; __STATIC_FORCEINLINE falls back to __STATIC_INLINE. */
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
/* Function attributes. */
#ifndef __NO_RETURN
#define __NO_RETURN __attribute__((noreturn))
#endif
#ifndef CMSIS_DEPRECATED
#define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __USED
#define __USED __attribute__((used))
#endif
#ifndef __WEAK
#define __WEAK __attribute__((weak))
#endif
/* Access a 32-bit value at address x through a packed one-member struct. */
#ifndef __UNALIGNED_UINT32
struct __packed__ T_UINT32 { uint32_t v; };
#define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
/* Alignment and packing use the toolchain's own keywords. */
#ifndef __ALIGNED
#define __ALIGNED(x) __align(x)
#endif
#ifndef __PACKED
#define __PACKED __packed__
#endif
/* No barrier is available: expands to a no-op and warns at build time. */
#ifndef __COMPILER_BARRIER
#warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
#define __COMPILER_BARRIER() (void)0
#endif
/*
* COSMIC Compiler
*/
#elif defined ( __CSMC__ )
#include <cmsis_csm.h>
/* Fallback definitions of the CMSIS compiler-abstraction macros for the
   COSMIC compiler. Each macro is only defined when not already present. */
/* Inline-assembly keyword (single leading underscore for this toolchain). */
#ifndef __ASM
#define __ASM _asm
#endif
/* Inline specifiers; __STATIC_FORCEINLINE falls back to __STATIC_INLINE. */
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __STATIC_INLINE
#endif
#ifndef __NO_RETURN
// Defined empty on purpose: non-returning functions are detected automatically, so no warning is issued.
#define __NO_RETURN
#endif
/* The next two macros have no equivalent here: they expand to nothing and a
   build-time warning reports that the annotation is ignored. */
#ifndef __USED
#warning No compiler specific solution for __USED. __USED is ignored.
#define __USED
#endif
#ifndef CMSIS_DEPRECATED
#warning No compiler specific solution for CMSIS_DEPRECATED. CMSIS_DEPRECATED is ignored.
#define CMSIS_DEPRECATED
#endif
#ifndef __WEAK
#define __WEAK __weak
#endif
/* Access a 32-bit value at address x through a packed one-member struct. */
#ifndef __UNALIGNED_UINT32
@packed struct T_UINT32 { uint32_t v; };
#define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
#endif
/* Alignment requests are dropped (empty expansion plus warning). */
#ifndef __ALIGNED
#warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
#define __ALIGNED(x)
#endif
#ifndef __PACKED
#define __PACKED @packed
#endif
/* No barrier is available: expands to a no-op and warns at build time. */
#ifndef __COMPILER_BARRIER
#warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
#define __COMPILER_BARRIER() (void)0
#endif
#else
#error Unknown compiler.
#endif
#endif /* __CMSIS_COMPILER_H */

View File

@ -1,514 +0,0 @@
/**************************************************************************//**
* @file cmsis_cp15.h
* @brief CMSIS compiler specific macros, functions, instructions
* @version V1.0.1
* @date 07. Sep 2017
******************************************************************************/
/*
* Copyright (c) 2009-2017 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined ( __ICCARM__ )
#pragma system_include /* treat file as system include file for MISRA check */
#elif defined (__clang__)
#pragma clang system_header /* treat file as system include file */
#endif
#ifndef __CMSIS_CP15_H
#define __CMSIS_CP15_H
/** \brief  Read ACTLR
    \details Fetches the register through __get_CP(15, 0, <dst>, 1, 0, 1).
    \return Auxiliary Control register value
 */
__STATIC_FORCEINLINE uint32_t __get_ACTLR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 1, 0, 1);
  return reg;
}
/** \brief Set ACTLR
\details Forwards actlr to __set_CP(15, 0, actlr, 1, 0, 1).
\param [in] actlr Auxiliary Control value to set
*/
__STATIC_FORCEINLINE void __set_ACTLR(uint32_t actlr)
{
__set_CP(15, 0, actlr, 1, 0, 1);
}
/** \brief  Read CPACR
    \details Fetches the register through __get_CP(15, 0, <dst>, 1, 0, 2).
    \return Coprocessor Access Control register value
 */
__STATIC_FORCEINLINE uint32_t __get_CPACR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 1, 0, 2);
  return reg;
}
/** \brief Set CPACR
\details Forwards cpacr to __set_CP(15, 0, cpacr, 1, 0, 2).
\param [in] cpacr Coprocessor Access Control value to set
*/
__STATIC_FORCEINLINE void __set_CPACR(uint32_t cpacr)
{
__set_CP(15, 0, cpacr, 1, 0, 2);
}
/** \brief  Read DFSR
    \details Fetches the register through __get_CP(15, 0, <dst>, 5, 0, 0).
    \return Data Fault Status Register value
 */
__STATIC_FORCEINLINE uint32_t __get_DFSR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 5, 0, 0);
  return reg;
}
/** \brief Set DFSR
\details Forwards dfsr to __set_CP(15, 0, dfsr, 5, 0, 0).
\param [in] dfsr Data Fault Status value to set
*/
__STATIC_FORCEINLINE void __set_DFSR(uint32_t dfsr)
{
__set_CP(15, 0, dfsr, 5, 0, 0);
}
/** \brief  Read IFSR
    \details Fetches the register through __get_CP(15, 0, <dst>, 5, 0, 1).
    \return Instruction Fault Status Register value
 */
__STATIC_FORCEINLINE uint32_t __get_IFSR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 5, 0, 1);
  return reg;
}
/** \brief Set IFSR
\details Forwards ifsr to __set_CP(15, 0, ifsr, 5, 0, 1).
\param [in] ifsr Instruction Fault Status value to set
*/
__STATIC_FORCEINLINE void __set_IFSR(uint32_t ifsr)
{
__set_CP(15, 0, ifsr, 5, 0, 1);
}
/** \brief  Read ISR
    \details Fetches the register through __get_CP(15, 0, <dst>, 12, 1, 0).
    \return Interrupt Status Register value
 */
__STATIC_FORCEINLINE uint32_t __get_ISR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 12, 1, 0);
  return reg;
}
/** \brief  Read CBAR
    \details Fetches the register through __get_CP(15, 4, <dst>, 15, 0, 0).
    \return Configuration Base Address register value
 */
__STATIC_FORCEINLINE uint32_t __get_CBAR(void)
{
  uint32_t reg;

  __get_CP(15, 4, reg, 15, 0, 0);
  return reg;
}
/** \brief  Read TTBR0
    \details Fetches Translation Table Base Register 0 through
             __get_CP(15, 0, <dst>, 2, 0, 0).
    \return Translation Table Base Register 0 value
 */
__STATIC_FORCEINLINE uint32_t __get_TTBR0(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 2, 0, 0);
  return reg;
}
/** \brief Set TTBR0
This function assigns the given value to the Translation Table Base Register 0.
\details Forwards ttbr0 to __set_CP(15, 0, ttbr0, 2, 0, 0).
\param [in] ttbr0 Translation Table Base Register 0 value to set
*/
__STATIC_FORCEINLINE void __set_TTBR0(uint32_t ttbr0)
{
__set_CP(15, 0, ttbr0, 2, 0, 0);
}
/** \brief  Read DACR
    \details Fetches the Domain Access Control Register through
             __get_CP(15, 0, <dst>, 3, 0, 0).
    \return Domain Access Control Register value
 */
__STATIC_FORCEINLINE uint32_t __get_DACR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 3, 0, 0);
  return reg;
}
/** \brief Set DACR
This function assigns the given value to the Domain Access Control Register.
\details Forwards dacr to __set_CP(15, 0, dacr, 3, 0, 0).
\param [in] dacr Domain Access Control Register value to set
*/
__STATIC_FORCEINLINE void __set_DACR(uint32_t dacr)
{
__set_CP(15, 0, dacr, 3, 0, 0);
}
/** \brief Set SCTLR
This function assigns the given value to the System Control Register.
\details Forwards sctlr to __set_CP(15, 0, sctlr, 1, 0, 0).
\param [in] sctlr System Control Register value to set
*/
__STATIC_FORCEINLINE void __set_SCTLR(uint32_t sctlr)
{
__set_CP(15, 0, sctlr, 1, 0, 0);
}
/** \brief  Read SCTLR
    \details Fetches the register through __get_CP(15, 0, <dst>, 1, 0, 0).
    \return System Control Register value
 */
__STATIC_FORCEINLINE uint32_t __get_SCTLR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 1, 0, 0);
  return reg;
}
/** \brief Set ACTRL
\details Forwards actrl to __set_CP(15, 0, actrl, 1, 0, 1); these are the
same __set_CP arguments that __set_ACTLR uses.
\param [in] actrl Auxiliary Control Register value to set
*/
__STATIC_FORCEINLINE void __set_ACTRL(uint32_t actrl)
{
__set_CP(15, 0, actrl, 1, 0, 1);
}
/** \brief  Read ACTRL
    \details Fetches the register through __get_CP(15, 0, <dst>, 1, 0, 1);
             these are the same __get_CP arguments that __get_ACTLR uses.
    \return Auxiliary Control Register value
 */
__STATIC_FORCEINLINE uint32_t __get_ACTRL(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 1, 0, 1);
  return reg;
}
/** \brief  Read MPIDR
    \details Fetches the Multiprocessor Affinity Register through
             __get_CP(15, 0, <dst>, 0, 0, 5).
    \return Multiprocessor Affinity Register value
 */
__STATIC_FORCEINLINE uint32_t __get_MPIDR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 0, 0, 5);
  return reg;
}
/** \brief  Read VBAR
    \details Fetches the Vector Base Address Register through
             __get_CP(15, 0, <dst>, 12, 0, 0).
    \return Vector Base Address Register
 */
__STATIC_FORCEINLINE uint32_t __get_VBAR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 12, 0, 0);
  return reg;
}
/** \brief Set VBAR
This function assigns the given value to the Vector Base Address Register.
\details Forwards vbar to __set_CP(15, 0, vbar, 12, 0, 0).
\param [in] vbar Vector Base Address Register value to set
*/
__STATIC_FORCEINLINE void __set_VBAR(uint32_t vbar)
{
__set_CP(15, 0, vbar, 12, 0, 0);
}
/** \brief  Read MVBAR
    \details Fetches the Monitor Vector Base Address Register through
             __get_CP(15, 0, <dst>, 12, 0, 1).
    \return Monitor Vector Base Address Register
 */
__STATIC_FORCEINLINE uint32_t __get_MVBAR(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 12, 0, 1);
  return reg;
}
/** \brief Set MVBAR
This function assigns the given value to the Monitor Vector Base Address Register.
\details Forwards mvbar to __set_CP(15, 0, mvbar, 12, 0, 1).
\param [in] mvbar Monitor Vector Base Address Register value to set
*/
__STATIC_FORCEINLINE void __set_MVBAR(uint32_t mvbar)
{
__set_CP(15, 0, mvbar, 12, 0, 1);
}
#if (defined(__CORTEX_A) && (__CORTEX_A == 7U) && \
defined(__TIM_PRESENT) && (__TIM_PRESENT == 1U)) || \
defined(DOXYGEN)
/** \brief Set CNTFRQ
This function assigns the given value to PL1 Physical Timer Counter Frequency Register (CNTFRQ).
\details Forwards value to __set_CP(15, 0, value, 14, 0, 0). Only compiled
when the enclosing __CORTEX_A / __TIM_PRESENT (or DOXYGEN) condition holds.
\param [in] value CNTFRQ Register value to set
*/
__STATIC_FORCEINLINE void __set_CNTFRQ(uint32_t value)
{
__set_CP(15, 0, value, 14, 0, 0);
}
/** \brief  Read CNTFRQ
    \details Fetches the PL1 Physical Timer Counter Frequency Register
             through __get_CP(15, 0, <dst>, 14, 0, 0).
    \return CNTFRQ Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CNTFRQ(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 14, 0, 0);
  return reg;
}
/** \brief Set CNTP_TVAL
This function assigns the given value to PL1 Physical Timer Value Register (CNTP_TVAL).
\details Forwards value to __set_CP(15, 0, value, 14, 2, 0).
\param [in] value CNTP_TVAL Register value to set
*/
__STATIC_FORCEINLINE void __set_CNTP_TVAL(uint32_t value)
{
__set_CP(15, 0, value, 14, 2, 0);
}
/** \brief  Read CNTP_TVAL
    \details Fetches the PL1 Physical Timer Value Register through
             __get_CP(15, 0, <dst>, 14, 2, 0).
    \return CNTP_TVAL Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CNTP_TVAL(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 14, 2, 0);
  return reg;
}
/** \brief  Read CNTPCT
    \details Fetches the 64-bit PL1 Physical Count Register through the
             64-bit accessor __get_CP64(15, 0, <dst>, 14).
    \return CNTPCT Register value
 */
__STATIC_FORCEINLINE uint64_t __get_CNTPCT(void)
{
  uint64_t count;

  __get_CP64(15, 0, count, 14);
  return count;
}
/** \brief Set CNTP_CVAL
This function assigns the given value to 64bits PL1 Physical Timer CompareValue Register (CNTP_CVAL).
\details Forwards value to the 64-bit accessor __set_CP64(15, 2, value, 14).
\param [in] value CNTP_CVAL Register value to set
*/
__STATIC_FORCEINLINE void __set_CNTP_CVAL(uint64_t value)
{
__set_CP64(15, 2, value, 14);
}
/** \brief  Read CNTP_CVAL
    \details Fetches the 64-bit PL1 Physical Timer CompareValue Register
             through __get_CP64(15, 2, <dst>, 14).
    \return CNTP_CVAL Register value
 */
__STATIC_FORCEINLINE uint64_t __get_CNTP_CVAL(void)
{
  uint64_t compare;

  __get_CP64(15, 2, compare, 14);
  return compare;
}
/** \brief Set CNTP_CTL
This function assigns the given value to PL1 Physical Timer Control Register (CNTP_CTL).
\details Forwards value to __set_CP(15, 0, value, 14, 2, 1).
\param [in] value CNTP_CTL Register value to set
*/
__STATIC_FORCEINLINE void __set_CNTP_CTL(uint32_t value)
{
__set_CP(15, 0, value, 14, 2, 1);
}
/** \brief  Read CNTP_CTL
    \details Fetches the register through __get_CP(15, 0, <dst>, 14, 2, 1).
    \return CNTP_CTL Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CNTP_CTL(void)
{
  uint32_t reg;

  __get_CP(15, 0, reg, 14, 2, 1);
  return reg;
}
#endif
/** \brief Set TLBIALL
TLB Invalidate All
\details Forwards value to __set_CP(15, 0, value, 8, 7, 0).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_TLBIALL(uint32_t value)
{
__set_CP(15, 0, value, 8, 7, 0);
}
/** \brief Set BPIALL.
Branch Predictor Invalidate All
\details Forwards value to __set_CP(15, 0, value, 7, 5, 6).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_BPIALL(uint32_t value)
{
__set_CP(15, 0, value, 7, 5, 6);
}
/** \brief Set ICIALLU
Instruction Cache Invalidate All
\details Forwards value to __set_CP(15, 0, value, 7, 5, 0).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_ICIALLU(uint32_t value)
{
__set_CP(15, 0, value, 7, 5, 0);
}
/** \brief Set DCCMVAC
Data cache clean
\details Forwards value to __set_CP(15, 0, value, 7, 10, 1).
\param [in] value Value written by the operation (presumably the address to clean - confirm against the architecture manual)
*/
__STATIC_FORCEINLINE void __set_DCCMVAC(uint32_t value)
{
__set_CP(15, 0, value, 7, 10, 1);
}
/** \brief Set DCIMVAC
Data cache invalidate
\details Forwards value to __set_CP(15, 0, value, 7, 6, 1).
\param [in] value Value written by the operation (presumably the address to invalidate - confirm against the architecture manual)
*/
__STATIC_FORCEINLINE void __set_DCIMVAC(uint32_t value)
{
__set_CP(15, 0, value, 7, 6, 1);
}
/** \brief Set DCCIMVAC
Data cache clean and invalidate
\details Forwards value to __set_CP(15, 0, value, 7, 14, 1).
\param [in] value Value written by the operation (presumably the address to clean and invalidate - confirm against the architecture manual)
*/
__STATIC_FORCEINLINE void __set_DCCIMVAC(uint32_t value)
{
__set_CP(15, 0, value, 7, 14, 1);
}
/** \brief Set CSSELR
\details Forwards value to __set_CP(15, 2, value, 0, 0, 0).
\param [in] value CSSELR Register value to set
*/
__STATIC_FORCEINLINE void __set_CSSELR(uint32_t value)
{
// Reference encoding kept from the original: MCR p15, 2, <value>, c0, c0, 0 (with a "memory" clobber)
__set_CP(15, 2, value, 0, 0, 0);
}
/** \brief  Read CSSELR
    \details Fetches the register through __get_CP(15, 2, <dst>, 0, 0, 0);
             the original source annotates this as
             `MRC p15, 2, <Rt>, c0, c0, 0`.
    \return CSSELR Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CSSELR(void)
{
  uint32_t reg;

  __get_CP(15, 2, reg, 0, 0, 0);
  return reg;
}
/** \brief Set CCSIDR
\details Kept for backward compatibility only: simply calls __set_CSSELR(value).
\param [in] value Value handed to __set_CSSELR
\deprecated CCSIDR itself is read-only. Use __set_CSSELR to select cache level instead.
*/
CMSIS_DEPRECATED
__STATIC_FORCEINLINE void __set_CCSIDR(uint32_t value)
{
__set_CSSELR(value);
}
/** \brief  Read CCSIDR
    \details Fetches the register through __get_CP(15, 1, <dst>, 0, 0, 0);
             the original source annotates this as
             `MRC p15, 1, <Rt>, c0, c0, 0`.
    \return CCSIDR Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CCSIDR(void)
{
  uint32_t reg;

  __get_CP(15, 1, reg, 0, 0, 0);
  return reg;
}
/** \brief  Read CLIDR
    \details Fetches the register through __get_CP(15, 1, <dst>, 0, 0, 1);
             the original source annotates this as
             `MRC p15, 1, <Rt>, c0, c0, 1`.
    \return CLIDR Register value
 */
__STATIC_FORCEINLINE uint32_t __get_CLIDR(void)
{
  uint32_t reg;

  __get_CP(15, 1, reg, 0, 0, 1);
  return reg;
}
/** \brief Set DCISW
\details Forwards value to __set_CP(15, 0, value, 7, 6, 2).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_DCISW(uint32_t value)
{
// Reference encoding kept from the original: MCR p15, 0, <value>, c7, c6, 2 (with a "memory" clobber)
__set_CP(15, 0, value, 7, 6, 2);
}
/** \brief Set DCCSW
\details Forwards value to __set_CP(15, 0, value, 7, 10, 2).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_DCCSW(uint32_t value)
{
// Reference encoding kept from the original: MCR p15, 0, <value>, c7, c10, 2 (with a "memory" clobber)
__set_CP(15, 0, value, 7, 10, 2);
}
/** \brief Set DCCISW
\details Forwards value to __set_CP(15, 0, value, 7, 14, 2).
\param [in] value Value written by the operation
*/
__STATIC_FORCEINLINE void __set_DCCISW(uint32_t value)
{
// Reference encoding kept from the original: MCR p15, 0, <value>, c7, c14, 2 (with a "memory" clobber)
__set_CP(15, 0, value, 7, 14, 2);
}
#endif

View File

@ -1,917 +0,0 @@
/**************************************************************************//**
* @file cmsis_gcc.h
* @brief CMSIS compiler specific macros, functions, instructions
* @version V1.3.2
* @date 24. March 2022
******************************************************************************/
/*
* Copyright (c) 2009-2022 Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __CMSIS_GCC_H
#define __CMSIS_GCC_H
/* ignore some GCC warnings */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-conversion"
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wunused-parameter"
/* Fallback for __has_builtin: when the compiler does not provide it, every
   query evaluates to 0 so the non-builtin code paths are selected. */
#ifndef __has_builtin
#define __has_builtin(x) (0)
#endif
/* CMSIS compiler specific defines. Each macro is only defined when the
   including project has not already supplied its own definition. */
/* Inline-assembly keyword. */
#ifndef __ASM
#define __ASM __asm
#endif
/* Inline specifiers; the FORCEINLINE variants add always_inline. */
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __FORCEINLINE
#define __FORCEINLINE __attribute__((always_inline))
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
#endif
/* Function and object attributes. */
#ifndef __NO_RETURN
#define __NO_RETURN __attribute__((__noreturn__))
#endif
#ifndef CMSIS_DEPRECATED
#define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __USED
#define __USED __attribute__((used))
#endif
#ifndef __WEAK
#define __WEAK __attribute__((weak))
#endif
/* Packing attributes: packed layout with an alignment of 1. */
#ifndef __PACKED
#define __PACKED __attribute__((packed, aligned(1)))
#endif
#ifndef __PACKED_STRUCT
#define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
#endif
/* Unaligned access helpers. Each one declares a packed single-member struct
   and accesses the value through a pointer to that struct. The -Wpacked
   diagnostic is silenced only around the struct declaration. */
/* Store the 16-bit value val at addr. */
#ifndef __UNALIGNED_UINT16_WRITE
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT16_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_WRITE */
__PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
#pragma GCC diagnostic pop
#define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
#endif
/* Load a 16-bit value from addr. */
#ifndef __UNALIGNED_UINT16_READ
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT16_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT16_READ */
__PACKED_STRUCT T_UINT16_READ { uint16_t v; };
#pragma GCC diagnostic pop
#define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
#endif
/* Store the 32-bit value val at addr. */
#ifndef __UNALIGNED_UINT32_WRITE
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT32_WRITE)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_WRITE */
__PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
#pragma GCC diagnostic pop
#define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
#endif
/* Load a 32-bit value from addr. */
#ifndef __UNALIGNED_UINT32_READ
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked"
/*lint -esym(9058, T_UINT32_READ)*/ /* disable MISRA 2012 Rule 2.4 for T_UINT32_READ */
__PACKED_STRUCT T_UINT32_READ { uint32_t v; };
#pragma GCC diagnostic pop
#define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
#endif
/* Request an alignment of x bytes. */
#ifndef __ALIGNED
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
/* Compiler-only barrier: an empty asm statement with a "memory" clobber;
   no instruction is emitted. */
#ifndef __COMPILER_BARRIER
#define __COMPILER_BARRIER() __ASM volatile("":::"memory")
#endif
/** \brief  Issue `qsub16 Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("qsub16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qsub8 Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("qsub8 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qadd16 Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("qadd16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qadd8 Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("qadd8 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qadd Rd, op1, op2` on signed 32-bit operands and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE int32_t __QADD( int32_t op1, int32_t op2)
{
  int32_t res;

  __ASM volatile ("qadd %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qsax Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("qsax %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `shsax Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("shsax %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `smlald RdLo, RdHi, op1, op2` on a 64-bit accumulator.
    \details The accumulator is split into two 32-bit words via a union; the
             word that supplies RdLo/RdHi depends on endianness (__ARMEB__).
             Both words are tied in/out operands of the instruction.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] acc  64-bit accumulator input
    \return Updated 64-bit accumulator
 */
__STATIC_FORCEINLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
{
  union llreg_u {
    uint32_t w32[2];
    uint64_t w64;
  } pair;

  pair.w64 = acc;
#ifndef __ARMEB__ /* Little endian */
  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (pair.w32[0]), "=r" (pair.w32[1]): "r" (op1), "r" (op2) , "0" (pair.w32[0]), "1" (pair.w32[1]) );
#else /* Big endian */
  __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (pair.w32[1]), "=r" (pair.w32[0]): "r" (op1), "r" (op2) , "0" (pair.w32[1]), "1" (pair.w32[0]) );
#endif
  return pair.w64;
}
/** \brief  Issue `qsub Rd, op1, op2` on signed 32-bit operands and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE int32_t __QSUB( int32_t op1, int32_t op2)
{
  int32_t res;

  __ASM volatile ("qsub %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `sxtb16 Rd, op1` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  Source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SXTB16(uint32_t op1)
{
  uint32_t res;

  __ASM ("sxtb16 %0, %1" : "=r" (res) : "r" (op1));
  return res;
}
/** \brief  Issue `smuad Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("smuad %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/* Pack halfwords, bottom/top: keeps the low 16 bits of ARG1 and combines
   them with the high 16 bits of (ARG2 << ARG3). Both inputs are converted
   to uint32_t before masking and shifting. */
#define __PKHBT(ARG1,ARG2,ARG3) ( (0x0000FFFFUL & ((uint32_t)(ARG1))) | \
                                  (0xFFFF0000UL & (((uint32_t)(ARG2)) << (ARG3))) )
/* Pack halfwords, top/bottom: keeps the high 16 bits of ARG1 and combines
   them with the low 16 bits of (ARG2 >> ARG3). Both inputs are converted
   to uint32_t first, so the right shift is a logical (unsigned) shift. */
#define __PKHTB(ARG1,ARG2,ARG3) ( (0xFFFF0000UL & ((uint32_t)(ARG1))) | \
                                  (0x0000FFFFUL & (((uint32_t)(ARG2)) >> (ARG3))) )
/** \brief  Issue `smlad Rd, op1, op2, op3` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] op3  Third source operand (instruction operand %3)
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
{
  uint32_t res;

  __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (res) : "r" (op1), "r" (op2), "r" (op3) );
  return res;
}
/** \brief  Issue `smuadx Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("smuadx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `smladx Rd, op1, op2, op3` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] op3  Third source operand (instruction operand %3)
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
{
  uint32_t res;

  __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (res) : "r" (op1), "r" (op2), "r" (op3) );
  return res;
}
/** \brief  Issue `smlaldx RdLo, RdHi, op1, op2` on a 64-bit accumulator.
    \details The accumulator is split into two 32-bit words via a union; the
             word that supplies RdLo/RdHi depends on endianness (__ARMEB__).
             Both words are tied in/out operands of the instruction.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] acc  64-bit accumulator input
    \return Updated 64-bit accumulator
 */
__STATIC_FORCEINLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
{
  union llreg_u {
    uint32_t w32[2];
    uint64_t w64;
  } pair;

  pair.w64 = acc;
#ifndef __ARMEB__ /* Little endian */
  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (pair.w32[0]), "=r" (pair.w32[1]): "r" (op1), "r" (op2) , "0" (pair.w32[0]), "1" (pair.w32[1]) );
#else /* Big endian */
  __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (pair.w32[1]), "=r" (pair.w32[0]): "r" (op1), "r" (op2) , "0" (pair.w32[1]), "1" (pair.w32[0]) );
#endif
  return pair.w64;
}
/** \brief  Issue `smmla Rd, op1, op2, op3` on signed operands and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] op3  Third source operand (instruction operand %3)
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE int32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
{
  int32_t res;

  __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (res): "r" (op1), "r" (op2), "r" (op3) );
  return res;
}
/** \brief  Issue `smusd Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("smusd %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `smusdx Rd, op1, op2` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM volatile ("smusdx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `qasx Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("qasx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `shadd16 Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("shadd16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `shsub16 Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("shsub16 %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `shasx Rd, op1, op2` and return Rd.
    \details The asm statement is not volatile, so the compiler may merge or
             drop calls whose result is unused.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
{
  uint32_t res;

  __ASM ("shasx %0, %1, %2" : "=r" (res) : "r" (op1), "r" (op2) );
  return res;
}
/** \brief  Issue `smlsdx Rd, op1, op2, op3` and return Rd.
    \param [in] op1  First source operand
    \param [in] op2  Second source operand
    \param [in] op3  Third source operand (instruction operand %3)
    \return Destination register contents after the instruction
 */
__STATIC_FORCEINLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
{
  uint32_t res;

  __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (res) : "r" (op1), "r" (op2), "r" (op3) );
  return res;
}
/* ########################## Core Instruction Access ######################### */
/**
\brief No Operation
\details Emits a single `nop` instruction as a volatile asm statement.
*/
#define __NOP() __ASM volatile ("nop")
/**
\brief Wait For Interrupt
\details Emits `wfi` as a volatile asm statement with a "memory" clobber.
*/
#define __WFI() __ASM volatile ("wfi":::"memory")
/**
\brief Wait For Event
\details Emits `wfe` as a volatile asm statement with a "memory" clobber.
*/
#define __WFE() __ASM volatile ("wfe":::"memory")
/**
\brief Send Event
\details Emits a single `sev` instruction as a volatile asm statement.
*/
#define __SEV() __ASM volatile ("sev")
/**
\brief Instruction Synchronization Barrier
\details Instruction Synchronization Barrier flushes the pipeline in the processor,
so that all instructions following the ISB are fetched from cache or memory,
after the instruction has been completed.
Emitted as `isb 0xF` with a "memory" clobber.
*/
__STATIC_FORCEINLINE void __ISB(void)
{
__ASM volatile ("isb 0xF":::"memory");
}
/**
\brief Data Synchronization Barrier
\details Acts as a special kind of Data Memory Barrier.
It completes when all explicit memory accesses before this instruction complete.
Emitted as `dsb 0xF` with a "memory" clobber.
*/
__STATIC_FORCEINLINE void __DSB(void)
{
__ASM volatile ("dsb 0xF":::"memory");
}
/**
\brief Data Memory Barrier
\details Ensures the apparent order of the explicit memory operations before
and after the instruction, without ensuring their completion.
Emitted as `dmb 0xF` with a "memory" clobber.
*/
__STATIC_FORCEINLINE void __DMB(void)
{
__ASM volatile ("dmb 0xF":::"memory");
}
/**
  \brief   Byte-swap a 32-bit word
  \details Reverses the order of the four bytes of value, e.g. 0x12345678
           becomes 0x78563412. Uses __builtin_bswap32 on GCC 4.5 or newer and
           the `rev` instruction otherwise.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
{
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
  return __builtin_bswap32(value);
#else
  uint32_t swapped;

  __ASM ("rev %0, %1" : "=r" (swapped) : "r" (value) );
  return swapped;
#endif
}
/**
  \brief   Byte-swap each halfword of a 32-bit word
  \details Reverses the byte order inside each 16-bit half independently,
           e.g. 0x12345678 becomes 0x34127856. Implemented with `rev16`.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
{
  uint32_t swapped;

  __ASM ("rev16 %0, %1" : "=r" (swapped) : "r" (value));
  return swapped;
}
/**
  \brief   Byte-swap a signed 16-bit value
  \details Reverses the two bytes of value and returns the signed 16-bit
           result, e.g. 0x0080 becomes 0x8000. Uses __builtin_bswap16 on
           GCC 4.8 or newer and the `revsh` instruction otherwise.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
{
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
  return (int16_t)__builtin_bswap16(value);
#else
  int16_t swapped;

  __ASM ("revsh %0, %1" : "=r" (swapped) : "r" (value) );
  return swapped;
#endif
}
/**
  \brief   Rotate a 32-bit value right
  \details The rotation count is reduced modulo 32 first. A reduced count of
           zero returns op1 unchanged, which also avoids a shift by 32 bits.
  \param [in]    op1  Value to rotate
  \param [in]    op2  Number of bits to rotate
  \return               Rotated value
 */
__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
{
  const uint32_t shift = op2 % 32U;

  if (shift != 0U)
  {
    return (op1 >> shift) | (op1 << (32U - shift));
  }
  return op1;
}
/**
\brief Breakpoint
\details Emits `bkpt <value>`; the argument is stringified into the
instruction text, so it has to be a literal the assembler accepts.
\param [in] value is ignored by the processor.
If required, a debugger can use it to store additional information about the breakpoint.
*/
#define __BKPT(value) __ASM volatile ("bkpt "#value)
/**
  \brief   Reverse the bit order of a 32-bit value
  \details Implemented with the `rbit` instruction.
  \param [in]    value  Value to reverse
  \return               Reversed value
 */
__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
{
  uint32_t mirrored;

  __ASM ("rbit %0, %1" : "=r" (mirrored) : "r" (value) );
  return mirrored;
}
/**
  \brief   Count leading zeros
  \param [in]  value  Value to count the leading zeros
  \return             number of leading zeros in value (32 when value is 0)
 */
__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t value)
{
  /* __builtin_clz(0) is formally undefined even though it maps to a CLZ
     instruction on ARM, so zero is answered explicitly. This keeps results
     ARM-compatible when compiled for another target and prevents the
     compiler from assuming "value was passed to __builtin_clz, therefore it
     is non-zero". ARM GCC 7.3 (and possibly earlier) folds the test away and
     leaves a single CLZ instruction.
   */
  if (value != 0U)
  {
    return __builtin_clz(value);
  }
  return 32U;
}
/**
\brief LDR Exclusive (8 bit)
\details Executes an exclusive LDR instruction for an 8 bit value.
\param [in] addr Pointer to data
\return value of type uint8_t at (*addr)
*/
__STATIC_FORCEINLINE uint8_t __LDREXB(volatile uint8_t *addr)
{
uint32_t result;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
__ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) );
#else
/* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
accepted by assembler. So has to use following less efficient pattern.
*/
__ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
#endif
return ((uint8_t) result); /* Narrow the 32-bit register result to the 8-bit return type */
}
/**
\brief LDR Exclusive (16 bit)
\details Executes an exclusive LDR instruction for 16 bit values.
\param [in] addr Pointer to data
\return value of type uint16_t at (*addr)
*/
__STATIC_FORCEINLINE uint16_t __LDREXH(volatile uint16_t *addr)
{
uint32_t result;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
__ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) );
#else
/* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
accepted by assembler. So has to use following less efficient pattern.
*/
__ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
#endif
return ((uint16_t) result); /* Narrow the 32-bit register result to the 16-bit return type */
}
/**
\brief LDR Exclusive (32 bit)
\details Executes an exclusive LDR instruction for 32 bit values.
\param [in] addr Pointer to data
\return value of type uint32_t at (*addr)
*/
__STATIC_FORCEINLINE uint32_t __LDREXW(volatile uint32_t *addr)
{
uint32_t result;
__ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) );
return(result);
}
/**
\brief STR Exclusive (8 bit)
\details Executes an exclusive STR instruction for 8 bit values.
\param [in] value Value to store
\param [in] addr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
__STATIC_FORCEINLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
{
uint32_t result;
/* "=&r" marks the status output as early-clobber so it is not allocated to an input register. */
__ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
return(result);
}
/**
\brief STR Exclusive (16 bit)
\details Executes an exclusive STR instruction for 16 bit values.
\param [in] value Value to store
\param [in] addr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
__STATIC_FORCEINLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
{
uint32_t result;
/* "=&r" marks the status output as early-clobber so it is not allocated to an input register. */
__ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
return(result);
}
/**
\brief STR Exclusive (32 bit)
\details Executes an exclusive STR instruction for 32 bit values.
\param [in] value Value to store
\param [in] addr Pointer to location
\return 0 Function succeeded
\return 1 Function failed
*/
__STATIC_FORCEINLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
{
uint32_t result;
/* "=&r" marks the status output as early-clobber so it is not allocated to an input register. */
__ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) );
return(result);
}
/**
\brief Remove the exclusive lock
\details Removes the exclusive lock which is created by LDREX.
Emitted as `clrex` with a "memory" clobber.
*/
__STATIC_FORCEINLINE void __CLREX(void)
{
__ASM volatile ("clrex" ::: "memory");
}
/**
\brief Signed Saturate
\details Saturates a signed value. Implemented as a GNU statement expression
around the `ssat` instruction; ARG2 is passed with the "I" (immediate)
constraint, so it must be a compile-time constant.
\param [in] ARG1 Value to be saturated
\param [in] ARG2 Bit position to saturate to (1..32)
\return Saturated value
*/
#define __SSAT(ARG1, ARG2) \
__extension__ \
({ \
int32_t __RES, __ARG1 = (ARG1); \
__ASM volatile ("ssat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \
__RES; \
})
/**
\brief Unsigned Saturate
\details Saturates an unsigned value. Implemented as a GNU statement
expression around the `usat` instruction; ARG2 is passed with the "I"
(immediate) constraint, so it must be a compile-time constant.
\param [in] ARG1 Value to be saturated
\param [in] ARG2 Bit position to saturate to (0..31)
\return Saturated value
*/
#define __USAT(ARG1, ARG2) \
__extension__ \
({ \
uint32_t __RES, __ARG1 = (ARG1); \
__ASM volatile ("usat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) : "cc" ); \
__RES; \
})
/* ########################### Core Function Access ########################### */
/**
\brief Enable IRQ Interrupts
\details Enables IRQ interrupts by clearing the I-bit in the CPSR.
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __enable_irq(void)
{
/* "memory" clobber: the compiler must not move memory accesses across the CPSIE. */
__ASM volatile ("cpsie i" : : : "memory");
}
/**
\brief Disable IRQ Interrupts
\details Disables IRQ interrupts by setting the I-bit in the CPSR.
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __disable_irq(void)
{
/* "memory" clobber: the compiler must not move memory accesses across the CPSID. */
__ASM volatile ("cpsid i" : : : "memory");
}
/**
\brief Enable FIQ
\details Enables FIQ interrupts by clearing the F-bit in the CPSR.
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __enable_fault_irq(void)
{
/* "memory" clobber: the compiler must not move memory accesses across the CPSIE. */
__ASM volatile ("cpsie f" : : : "memory");
}
/**
\brief Disable FIQ
\details Disables FIQ interrupts by setting the F-bit in the CPSR.
Can only be executed in Privileged modes.
*/
__STATIC_FORCEINLINE void __disable_fault_irq(void)
{
/* "memory" clobber: the compiler must not move memory accesses across the CPSID. */
__ASM volatile ("cpsid f" : : : "memory");
}
/**
\brief Get FPSCR
\details Returns the current value of the Floating Point Status/Control register.
         When __FPU_PRESENT or __FPU_USED is not defined as 1 no instruction is emitted and 0 is returned.
\return Floating Point Status/Control register value
*/
__STATIC_FORCEINLINE uint32_t __get_FPSCR(void)
{
#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
(defined (__FPU_USED ) && (__FPU_USED == 1U)) )
/* Prefer the compiler built-in when it is available, otherwise read the register with VMRS. */
#if __has_builtin(__builtin_arm_get_fpscr)
// Re-enable using built-in when GCC has been fixed
// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
/* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
return __builtin_arm_get_fpscr();
#else
uint32_t result;
__ASM volatile ("VMRS %0, fpscr" : "=r" (result) );
return(result);
#endif
#else
return(0U);
#endif
}
/**
\brief Set FPSCR
\details Assigns the given value to the Floating Point Status/Control register.
         When __FPU_PRESENT or __FPU_USED is not defined as 1 the call does nothing.
\param [in] fpscr Floating Point Status/Control value to set
*/
__STATIC_FORCEINLINE void __set_FPSCR(uint32_t fpscr)
{
#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
(defined (__FPU_USED ) && (__FPU_USED == 1U)) )
/* Prefer the compiler built-in when it is available, otherwise write the register with VMSR. */
#if __has_builtin(__builtin_arm_set_fpscr)
// Re-enable using built-in when GCC has been fixed
// || (__GNUC__ > 7) || (__GNUC__ == 7 && __GNUC_MINOR__ >= 2)
/* see https://gcc.gnu.org/ml/gcc-patches/2017-04/msg00443.html */
__builtin_arm_set_fpscr(fpscr);
#else
/* "vfpcc" and "memory" are declared as clobbered by the write. */
__ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc", "memory");
#endif
#else
/* No FPU in use: only reference the parameter. */
(void)fpscr;
#endif
}
/** \brief Get CPSR Register
\details Reads the CPSR with a single MRS instruction.
\return CPSR Register value
*/
__STATIC_FORCEINLINE uint32_t __get_CPSR(void)
{
uint32_t result;
__ASM volatile("MRS %0, cpsr" : "=r" (result) );
return(result);
}
/** \brief Set CPSR Register
\details Writes the CPSR with a single MSR instruction; condition codes and memory are declared as clobbered.
\param [in] cpsr CPSR value to set
*/
__STATIC_FORCEINLINE void __set_CPSR(uint32_t cpsr)
{
__ASM volatile ("MSR cpsr, %0" : : "r" (cpsr) : "cc", "memory");
}
/** \brief Get Mode
    \details Reads the CPSR and keeps only its five least-significant bits (mask 0x1F).
    \return Processor Mode
*/
__STATIC_FORCEINLINE uint32_t __get_mode(void)
{
  const uint32_t cpsr = __get_CPSR();

  return (cpsr & 0x1FU);
}
/** \brief Set Mode
\details Writes \p mode to the CPSR control field with "MSR cpsr_c".
         NOTE(review): cpsr_c presumably covers more than the five mode bits (interrupt masks and
         state bit as well) - confirm that callers pass a complete control-field value.
\param [in] mode Mode value to set
*/
__STATIC_FORCEINLINE void __set_mode(uint32_t mode)
{
__ASM volatile("MSR cpsr_c, %0" : : "r" (mode) : "memory");
}
/** \brief Get Stack Pointer
\details Copies the stack pointer of the current mode into a general purpose register.
\return Stack Pointer value
*/
__STATIC_FORCEINLINE uint32_t __get_SP(void)
{
uint32_t result;
__ASM volatile("MOV %0, sp" : "=r" (result) : : "memory");
return result;
}
/** \brief Set Stack Pointer
\details Loads the stack pointer of the current mode from \p stack.
\param [in] stack Stack Pointer value to set
*/
__STATIC_FORCEINLINE void __set_SP(uint32_t stack)
{
__ASM volatile("MOV sp, %0" : : "r" (stack) : "memory");
}
/** \brief Get USR/SYS Stack Pointer
\details Saves the current CPSR, switches to mode 0x1F with CPS, copies sp into the result,
         then restores the saved CPSR and issues an ISB.
\return USR/SYS Stack Pointer value
*/
__STATIC_FORCEINLINE uint32_t __get_SP_usr(void)
{
/* Remember the caller's CPSR so the original mode can be restored below. */
uint32_t cpsr = __get_CPSR();
uint32_t result;
__ASM volatile(
"CPS #0x1F \n"
"MOV %0, sp " : "=r"(result) : : "memory"
);
/* Back to the original mode. */
__set_CPSR(cpsr);
__ISB();
return result;
}
/** \brief Set USR/SYS Stack Pointer
\details Saves the current CPSR, switches to mode 0x1F with CPS, loads sp from \p topOfProcStack,
         then restores the saved CPSR and issues an ISB.
\param [in] topOfProcStack USR/SYS Stack Pointer value to set
*/
__STATIC_FORCEINLINE void __set_SP_usr(uint32_t topOfProcStack)
{
/* Remember the caller's CPSR so the original mode can be restored below. */
uint32_t cpsr = __get_CPSR();
__ASM volatile(
"CPS #0x1F \n"
"MOV sp, %0 " : : "r" (topOfProcStack) : "memory"
);
/* Back to the original mode. */
__set_CPSR(cpsr);
__ISB();
}
/** \brief Get FPEXC
\details Reads FPEXC with VMRS. When __FPU_PRESENT is not 1 no instruction is emitted and 0 is returned.
\return Floating Point Exception Control register value
*/
__STATIC_FORCEINLINE uint32_t __get_FPEXC(void)
{
#if (__FPU_PRESENT == 1)
uint32_t result;
__ASM volatile("VMRS %0, fpexc" : "=r" (result) : : "memory");
return(result);
#else
return(0);
#endif
}
/** \brief Set FPEXC
\details Writes FPEXC with VMSR. When __FPU_PRESENT is not 1 the function body is empty.
\param [in] fpexc Floating Point Exception Control value to set
*/
__STATIC_FORCEINLINE void __set_FPEXC(uint32_t fpexc)
{
#if (__FPU_PRESENT == 1)
__ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
#endif
}
/*
* Include common core functions to access Coprocessor 15 registers
*
* __get_CP   / __set_CP   wrap MRC  / MCR  (Rt is a 32-bit lvalue / value).
* __get_CP64 / __set_CP64 wrap MRRC / MCRR (Rt is a 64-bit lvalue / value; %Q0 and %R0 name the
*                         two registers of the pair that holds it).
* cp, op1, CRn, CRm and op2 are stringified into the instruction text, so they must be literal numbers.
* These macros are defined before cmsis_cp15.h is included below.
*/
#define __get_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
#define __set_CP(cp, op1, Rt, CRn, CRm, op2) __ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
#define __get_CP64(cp, op1, Rt, CRm) __ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : "=r" (Rt) : : "memory" )
#define __set_CP64(cp, op1, Rt, CRm) __ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : : "r" (Rt) : "memory" )
#include "cmsis_cp15.h"
/** \brief Enable Floating Point Unit
Critical section, called from undef handler, so systick is disabled

\details One asm statement performs, in order:
  1. CPACR (p15, c1, c0, 2) |= 0x00F00000, followed by an ISB;
  2. FPEXC |= 0x40000000;
  3. D0-D15 are set to zero, and D16-D31 as well when __ARM_NEON is defined as 1;
  4. FPSCR &= 0x00086060.
R1 and R2 are used as scratch registers and are declared as clobbered together with the flags.
NOTE(review): the D registers written here are not listed as clobbers - confirm this is acceptable
for every call site.
*/
__STATIC_INLINE void __FPU_Enable(void)
{
__ASM volatile(
//Permit access to VFP/NEON, registers by modifying CPACR
" MRC p15,0,R1,c1,c0,2 \n"
" ORR R1,R1,#0x00F00000 \n"
" MCR p15,0,R1,c1,c0,2 \n"
//Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
" ISB \n"
//Enable VFP/NEON
" VMRS R1,FPEXC \n"
" ORR R1,R1,#0x40000000 \n"
" VMSR FPEXC,R1 \n"
//Initialise VFP/NEON registers to 0
" MOV R2,#0 \n"
//Initialise D16 registers to 0
" VMOV D0, R2,R2 \n"
" VMOV D1, R2,R2 \n"
" VMOV D2, R2,R2 \n"
" VMOV D3, R2,R2 \n"
" VMOV D4, R2,R2 \n"
" VMOV D5, R2,R2 \n"
" VMOV D6, R2,R2 \n"
" VMOV D7, R2,R2 \n"
" VMOV D8, R2,R2 \n"
" VMOV D9, R2,R2 \n"
" VMOV D10,R2,R2 \n"
" VMOV D11,R2,R2 \n"
" VMOV D12,R2,R2 \n"
" VMOV D13,R2,R2 \n"
" VMOV D14,R2,R2 \n"
" VMOV D15,R2,R2 \n"
#if (defined(__ARM_NEON) && (__ARM_NEON == 1))
//Initialise D32 registers to 0
" VMOV D16,R2,R2 \n"
" VMOV D17,R2,R2 \n"
" VMOV D18,R2,R2 \n"
" VMOV D19,R2,R2 \n"
" VMOV D20,R2,R2 \n"
" VMOV D21,R2,R2 \n"
" VMOV D22,R2,R2 \n"
" VMOV D23,R2,R2 \n"
" VMOV D24,R2,R2 \n"
" VMOV D25,R2,R2 \n"
" VMOV D26,R2,R2 \n"
" VMOV D27,R2,R2 \n"
" VMOV D28,R2,R2 \n"
" VMOV D29,R2,R2 \n"
" VMOV D30,R2,R2 \n"
" VMOV D31,R2,R2 \n"
#endif
//Initialise FPSCR to a known state
" VMRS R1,FPSCR \n"
" LDR R2,=0x00086060 \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
" AND R1,R1,R2 \n"
" VMSR FPSCR,R1 "
: : : "cc", "r1", "r2"
);
}
#pragma GCC diagnostic pop
#endif /* __CMSIS_GCC_H */

View File

@ -1,573 +0,0 @@
/**************************************************************************//**
* @file cmsis_iccarm.h
* @brief CMSIS compiler ICCARM (IAR Compiler for Arm) header file
* @version V5.0.7
* @date 15. May 2019
******************************************************************************/
//------------------------------------------------------------------------------
//
// Copyright (c) 2017-2018 IAR Systems
// Copyright (c) 2018-2019 Arm Limited
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License")
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//------------------------------------------------------------------------------
#ifndef __CMSIS_ICCARM_H__
#define __CMSIS_ICCARM_H__
#ifndef __ICCARM__
#error This file should only be compiled by ICCARM
#endif
#pragma system_include
#define __IAR_FT _Pragma("inline=forced") __intrinsic
#if (__VER__ >= 8000000)
#define __ICCARM_V8 1
#else
#define __ICCARM_V8 0
#endif
#pragma language=extended
#ifndef __ALIGNED
#if __ICCARM_V8
#define __ALIGNED(x) __attribute__((aligned(x)))
#elif (__VER__ >= 7080000)
/* Needs IAR language extensions */
#define __ALIGNED(x) __attribute__((aligned(x)))
#else
#warning No compiler specific solution for __ALIGNED.__ALIGNED is ignored.
#define __ALIGNED(x)
#endif
#endif
/* Define compiler macros for CPU architecture, used in CMSIS 5.
*/
#if __ARM_ARCH_7A__
/* Macro already defined */
#else
#if defined(__ARM7A__)
#define __ARM_ARCH_7A__ 1
#endif
#endif
#ifndef __ASM
#define __ASM __asm
#endif
#ifndef __COMPILER_BARRIER
#define __COMPILER_BARRIER() __ASM volatile("":::"memory")
#endif
#ifndef __INLINE
#define __INLINE inline
#endif
#ifndef __NO_RETURN
#if __ICCARM_V8
#define __NO_RETURN __attribute__((__noreturn__))
#else
#define __NO_RETURN _Pragma("object_attribute=__noreturn")
#endif
#endif
#ifndef __PACKED
/* Needs IAR language extensions */
#if __ICCARM_V8
#define __PACKED __attribute__((packed, aligned(1)))
#else
#define __PACKED __packed
#endif
#endif
#ifndef __PACKED_STRUCT
/* Needs IAR language extensions */
#if __ICCARM_V8
#define __PACKED_STRUCT struct __attribute__((packed, aligned(1)))
#else
#define __PACKED_STRUCT __packed struct
#endif
#endif
#ifndef __PACKED_UNION
/* Needs IAR language extensions */
#if __ICCARM_V8
#define __PACKED_UNION union __attribute__((packed, aligned(1)))
#else
#define __PACKED_UNION __packed union
#endif
#endif
#ifndef __RESTRICT
#if __ICCARM_V8
#define __RESTRICT __restrict
#else
/* Needs IAR language extensions */
#define __RESTRICT restrict
#endif
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#ifndef __FORCEINLINE
#define __FORCEINLINE _Pragma("inline=forced")
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
#endif
#ifndef CMSIS_DEPRECATED
#define CMSIS_DEPRECATED __attribute__((deprecated))
#endif
#ifndef __UNALIGNED_UINT16_READ
#pragma language=save
#pragma language=extended
/* Loads a 16-bit value through a __packed-qualified pointer; backs __UNALIGNED_UINT16_READ. */
__IAR_FT uint16_t __iar_uint16_read(void const *ptr)
{
return *(__packed uint16_t*)(ptr);
}
#pragma language=restore
#define __UNALIGNED_UINT16_READ(PTR) __iar_uint16_read(PTR)
#endif
#ifndef __UNALIGNED_UINT16_WRITE
#pragma language=save
#pragma language=extended
/* Stores a 16-bit value through a __packed-qualified pointer; backs __UNALIGNED_UINT16_WRITE.
   NOTE(review): ptr is declared "void const *" although the function writes through it; the
   signature is kept as it is for compatibility with existing callers. */
__IAR_FT void __iar_uint16_write(void const *ptr, uint16_t val)
{
  *(__packed uint16_t*)(ptr) = val;
}
#pragma language=restore
#define __UNALIGNED_UINT16_WRITE(PTR,VAL) __iar_uint16_write(PTR,VAL)
#endif
#ifndef __UNALIGNED_UINT32_READ
#pragma language=save
#pragma language=extended
/* Loads a 32-bit value through a __packed-qualified pointer; backs __UNALIGNED_UINT32_READ. */
__IAR_FT uint32_t __iar_uint32_read(void const *ptr)
{
return *(__packed uint32_t*)(ptr);
}
#pragma language=restore
#define __UNALIGNED_UINT32_READ(PTR) __iar_uint32_read(PTR)
#endif
#ifndef __UNALIGNED_UINT32_WRITE
#pragma language=save
#pragma language=extended
/* Stores a 32-bit value through a __packed-qualified pointer; backs __UNALIGNED_UINT32_WRITE.
   NOTE(review): ptr is declared "void const *" although the function writes through it; the
   signature is kept as it is for compatibility with existing callers. */
__IAR_FT void __iar_uint32_write(void const *ptr, uint32_t val)
{
  *(__packed uint32_t*)(ptr) = val;
}
#pragma language=restore
#define __UNALIGNED_UINT32_WRITE(PTR,VAL) __iar_uint32_write(PTR,VAL)
#endif
#if 0
#ifndef __UNALIGNED_UINT32 /* deprecated */
#pragma language=save
#pragma language=extended
__packed struct __iar_u32 { uint32_t v; };
#pragma language=restore
#define __UNALIGNED_UINT32(PTR) (((struct __iar_u32 *)(PTR))->v)
#endif
#endif
#ifndef __USED
#if __ICCARM_V8
#define __USED __attribute__((used))
#else
#define __USED _Pragma("__root")
#endif
#endif
#ifndef __WEAK
#if __ICCARM_V8
#define __WEAK __attribute__((weak))
#else
#define __WEAK _Pragma("__weak")
#endif
#endif
#ifndef __ICCARM_INTRINSICS_VERSION__
#define __ICCARM_INTRINSICS_VERSION__ 0
#endif
#if __ICCARM_INTRINSICS_VERSION__ == 2
#if defined(__CLZ)
#undef __CLZ
#endif
#if defined(__REVSH)
#undef __REVSH
#endif
#if defined(__RBIT)
#undef __RBIT
#endif
#if defined(__SSAT)
#undef __SSAT
#endif
#if defined(__USAT)
#undef __USAT
#endif
#include "iccarm_builtin.h"
#define __enable_irq __iar_builtin_enable_interrupt
#define __disable_irq __iar_builtin_disable_interrupt
#define __enable_fault_irq __iar_builtin_enable_fiq
#define __disable_fault_irq __iar_builtin_disable_fiq
#define __arm_rsr __iar_builtin_rsr
#define __arm_wsr __iar_builtin_wsr
#if __FPU_PRESENT
#define __get_FPSCR() (__arm_rsr("FPSCR"))
#else
#define __get_FPSCR() ( 0 )
#endif
#define __set_FPSCR(VALUE) (__arm_wsr("FPSCR", VALUE))
#define __get_CPSR() (__arm_rsr("CPSR"))
#define __get_mode() (__get_CPSR() & 0x1FU)
#define __set_CPSR(VALUE) (__arm_wsr("CPSR", (VALUE)))
#define __set_mode(VALUE) (__arm_wsr("CPSR_c", (VALUE)))
#define __get_FPEXC() (__arm_rsr("FPEXC"))
#define __set_FPEXC(VALUE) (__arm_wsr("FPEXC", VALUE))
#define __get_CP(cp, op1, RT, CRn, CRm, op2) \
((RT) = __arm_rsr("p" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2))
#define __set_CP(cp, op1, RT, CRn, CRm, op2) \
(__arm_wsr("p" # cp ":" # op1 ":c" # CRn ":c" # CRm ":" # op2, (RT)))
#define __get_CP64(cp, op1, Rt, CRm) \
__ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : "=r" (Rt) : : "memory" )
#define __set_CP64(cp, op1, Rt, CRm) \
__ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : : "r" (Rt) : "memory" )
#include "cmsis_cp15.h"
#define __NOP __iar_builtin_no_operation
#define __CLZ __iar_builtin_CLZ
#define __CLREX __iar_builtin_CLREX
#define __DMB __iar_builtin_DMB
#define __DSB __iar_builtin_DSB
#define __ISB __iar_builtin_ISB
#define __LDREXB __iar_builtin_LDREXB
#define __LDREXH __iar_builtin_LDREXH
#define __LDREXW __iar_builtin_LDREX
#define __RBIT __iar_builtin_RBIT
#define __REV __iar_builtin_REV
#define __REV16 __iar_builtin_REV16
/* Typed wrapper: forwards to __iar_builtin_REVSH and casts the result to int16_t. */
__IAR_FT int16_t __REVSH(int16_t val)
{
return (int16_t) __iar_builtin_REVSH(val);
}
#define __ROR __iar_builtin_ROR
#define __RRX __iar_builtin_RRX
#define __SEV __iar_builtin_SEV
#define __SSAT __iar_builtin_SSAT
#define __STREXB __iar_builtin_STREXB
#define __STREXH __iar_builtin_STREXH
#define __STREXW __iar_builtin_STREX
#define __USAT __iar_builtin_USAT
#define __WFE __iar_builtin_WFE
#define __WFI __iar_builtin_WFI
#define __SADD8 __iar_builtin_SADD8
#define __QADD8 __iar_builtin_QADD8
#define __SHADD8 __iar_builtin_SHADD8
#define __UADD8 __iar_builtin_UADD8
#define __UQADD8 __iar_builtin_UQADD8
#define __UHADD8 __iar_builtin_UHADD8
#define __SSUB8 __iar_builtin_SSUB8
#define __QSUB8 __iar_builtin_QSUB8
#define __SHSUB8 __iar_builtin_SHSUB8
#define __USUB8 __iar_builtin_USUB8
#define __UQSUB8 __iar_builtin_UQSUB8
#define __UHSUB8 __iar_builtin_UHSUB8
#define __SADD16 __iar_builtin_SADD16
#define __QADD16 __iar_builtin_QADD16
#define __SHADD16 __iar_builtin_SHADD16
#define __UADD16 __iar_builtin_UADD16
#define __UQADD16 __iar_builtin_UQADD16
#define __UHADD16 __iar_builtin_UHADD16
#define __SSUB16 __iar_builtin_SSUB16
#define __QSUB16 __iar_builtin_QSUB16
#define __SHSUB16 __iar_builtin_SHSUB16
#define __USUB16 __iar_builtin_USUB16
#define __UQSUB16 __iar_builtin_UQSUB16
#define __UHSUB16 __iar_builtin_UHSUB16
#define __SASX __iar_builtin_SASX
#define __QASX __iar_builtin_QASX
#define __SHASX __iar_builtin_SHASX
#define __UASX __iar_builtin_UASX
#define __UQASX __iar_builtin_UQASX
#define __UHASX __iar_builtin_UHASX
#define __SSAX __iar_builtin_SSAX
#define __QSAX __iar_builtin_QSAX
#define __SHSAX __iar_builtin_SHSAX
#define __USAX __iar_builtin_USAX
#define __UQSAX __iar_builtin_UQSAX
#define __UHSAX __iar_builtin_UHSAX
#define __USAD8 __iar_builtin_USAD8
#define __USADA8 __iar_builtin_USADA8
#define __SSAT16 __iar_builtin_SSAT16
#define __USAT16 __iar_builtin_USAT16
#define __UXTB16 __iar_builtin_UXTB16
#define __UXTAB16 __iar_builtin_UXTAB16
#define __SXTB16 __iar_builtin_SXTB16
#define __SXTAB16 __iar_builtin_SXTAB16
#define __SMUAD __iar_builtin_SMUAD
#define __SMUADX __iar_builtin_SMUADX
#define __SMMLA __iar_builtin_SMMLA
#define __SMLAD __iar_builtin_SMLAD
#define __SMLADX __iar_builtin_SMLADX
#define __SMLALD __iar_builtin_SMLALD
#define __SMLALDX __iar_builtin_SMLALDX
#define __SMUSD __iar_builtin_SMUSD
#define __SMUSDX __iar_builtin_SMUSDX
#define __SMLSD __iar_builtin_SMLSD
#define __SMLSDX __iar_builtin_SMLSDX
#define __SMLSLD __iar_builtin_SMLSLD
#define __SMLSLDX __iar_builtin_SMLSLDX
#define __SEL __iar_builtin_SEL
#define __QADD __iar_builtin_QADD
#define __QSUB __iar_builtin_QSUB
#define __PKHBT __iar_builtin_PKHBT
#define __PKHTB __iar_builtin_PKHTB
#else /* __ICCARM_INTRINSICS_VERSION__ == 2 */
#if !__FPU_PRESENT
/* Placeholder name while <intrinsics.h> is processed - presumably so that a __get_FPSCR
   declared there does not clash with the stub macro defined afterwards. */
#define __get_FPSCR __cmsis_iar_get_FPSR_not_active
#endif
#ifdef __INTRINSICS_INCLUDED
#error intrinsics.h is already included previously!
#endif
#include <intrinsics.h>
#if !__FPU_PRESENT
/* Drop the placeholder first: redefining an object-like macro as a function-like one
   without #undef is an incompatible macro redefinition. */
#undef __get_FPSCR
#define __get_FPSCR() (0)
#endif
#pragma diag_suppress=Pe940
#pragma diag_suppress=Pe177
#define __enable_irq __enable_interrupt
#define __disable_irq __disable_interrupt
#define __enable_fault_irq __enable_fiq
#define __disable_fault_irq __disable_fiq
#define __NOP __no_operation
#define __get_xPSR __get_PSR
/* Writes mode to the CPSR control field with "MSR cpsr_c". */
__IAR_FT void __set_mode(uint32_t mode)
{
__ASM volatile("MSR cpsr_c, %0" : : "r" (mode) : "memory");
}
/* CMSIS-named wrapper: forwards to __LDREX with the pointer cast to "unsigned long *". */
__IAR_FT uint32_t __LDREXW(uint32_t volatile *ptr)
{
return __LDREX((unsigned long *)ptr);
}
/* CMSIS-named wrapper: forwards to __STREX with the pointer cast to "unsigned long *" and returns its status. */
__IAR_FT uint32_t __STREXW(uint32_t value, uint32_t volatile *ptr)
{
return __STREX(value, (unsigned long *)ptr);
}
/* Executes an RRX instruction on value and returns its result; the flags are declared as clobbered. */
__IAR_FT uint32_t __RRX(uint32_t value)
{
uint32_t result;
__ASM("RRX %0, %1" : "=r"(result) : "r" (value) : "cc");
return(result);
}
/* Rotate op1 right by op2 bits.
   The count is reduced modulo 32 and a zero count returns op1 unchanged, so neither shift
   below is ever executed with a count of 32 or more (undefined behaviour for a 32-bit operand).
   Counts 1..31 give the same result as before. */
__IAR_FT uint32_t __ROR(uint32_t op1, uint32_t op2)
{
  op2 %= 32U;
  if (op2 == 0U)
  {
    return op1;
  }
  return (op1 >> op2) | (op1 << (32U - op2));
}
/* Reads FPEXC with VMRS; returns 0 without touching the register when __FPU_PRESENT is not 1. */
__IAR_FT uint32_t __get_FPEXC(void)
{
#if (__FPU_PRESENT == 1)
uint32_t result;
__ASM volatile("VMRS %0, fpexc" : "=r" (result) : : "memory");
return(result);
#else
return(0);
#endif
}
/* Writes FPEXC with VMSR; the body is empty when __FPU_PRESENT is not 1. */
__IAR_FT void __set_FPEXC(uint32_t fpexc)
{
#if (__FPU_PRESENT == 1)
__ASM volatile ("VMSR fpexc, %0" : : "r" (fpexc) : "memory");
#endif
}
#define __get_CP(cp, op1, Rt, CRn, CRm, op2) \
__ASM volatile("MRC p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : "=r" (Rt) : : "memory" )
#define __set_CP(cp, op1, Rt, CRn, CRm, op2) \
__ASM volatile("MCR p" # cp ", " # op1 ", %0, c" # CRn ", c" # CRm ", " # op2 : : "r" (Rt) : "memory" )
#define __get_CP64(cp, op1, Rt, CRm) \
__ASM volatile("MRRC p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : "=r" (Rt) : : "memory" )
#define __set_CP64(cp, op1, Rt, CRm) \
__ASM volatile("MCRR p" # cp ", " # op1 ", %Q0, %R0, c" # CRm : : "r" (Rt) : "memory" )
#include "cmsis_cp15.h"
#endif /* __ICCARM_INTRINSICS_VERSION__ == 2 */
#define __BKPT(value) __asm volatile ("BKPT %0" : : "i"(value))
/* Get USR/SYS stack pointer.
   Saves the CPSR, switches to mode 0x1F with CPS, copies sp, then restores the saved CPSR
   control field and issues an ISB.
   The CPSR is restored from operand %0, i.e. the register MRS has just written. The saved value
   is no longer passed as an input operand, because that operand was bound to the still
   uninitialised C variable. Both outputs are early-clobber since they are written in the middle
   of the sequence. */
__IAR_FT uint32_t __get_SP_usr(void)
{
  uint32_t cpsr;
  uint32_t result;
  __ASM volatile(
    "MRS %0, cpsr \n"
    "CPS #0x1F \n"        // no effect in USR mode
    "MOV %1, sp \n"
    "MSR cpsr_c, %0 \n"   // no effect in USR mode
    "ISB" : "=&r"(cpsr), "=&r"(result) : : "memory"
  );
  return result;
}
/* Set USR/SYS stack pointer.
   Saves the CPSR, switches to mode 0x1F with CPS, loads sp from topOfProcStack, then restores
   the saved CPSR control field and issues an ISB.
   The CPSR is restored from operand %0 (written by MRS) rather than from an input bound to the
   uninitialised C variable. %0 is early-clobber because it is written before input %1 is read,
   so the two must not share a register. */
__IAR_FT void __set_SP_usr(uint32_t topOfProcStack)
{
  uint32_t cpsr;
  __ASM volatile(
    "MRS %0, cpsr \n"
    "CPS #0x1F \n"        // no effect in USR mode
    "MOV sp, %1 \n"
    "MSR cpsr_c, %0 \n"   // no effect in USR mode
    "ISB" : "=&r"(cpsr) : "r" (topOfProcStack) : "memory"
  );
}
#define __get_mode() (__get_CPSR() & 0x1FU)
/* Enable the floating point unit.
   One asm statement performs, in order:
     1. CPACR (p15, c1, c0, 2) |= 0x00F00000, followed by an ISB;
     2. FPEXC |= 0x40000000;
     3. D0-D15 are set to zero, and D16-D31 as well when __ARM_ADVANCED_SIMD__ is defined;
     4. FPSCR &= 0x00086060.
   R1 and R2 are used as scratch registers and are declared as clobbered together with the flags. */
__STATIC_INLINE
void __FPU_Enable(void)
{
__ASM volatile(
//Permit access to VFP/NEON, registers by modifying CPACR
" MRC p15,0,R1,c1,c0,2 \n"
" ORR R1,R1,#0x00F00000 \n"
" MCR p15,0,R1,c1,c0,2 \n"
//Ensure that subsequent instructions occur in the context of VFP/NEON access permitted
" ISB \n"
//Enable VFP/NEON
" VMRS R1,FPEXC \n"
" ORR R1,R1,#0x40000000 \n"
" VMSR FPEXC,R1 \n"
//Initialise VFP/NEON registers to 0
" MOV R2,#0 \n"
//Initialise D16 registers to 0
" VMOV D0, R2,R2 \n"
" VMOV D1, R2,R2 \n"
" VMOV D2, R2,R2 \n"
" VMOV D3, R2,R2 \n"
" VMOV D4, R2,R2 \n"
" VMOV D5, R2,R2 \n"
" VMOV D6, R2,R2 \n"
" VMOV D7, R2,R2 \n"
" VMOV D8, R2,R2 \n"
" VMOV D9, R2,R2 \n"
" VMOV D10,R2,R2 \n"
" VMOV D11,R2,R2 \n"
" VMOV D12,R2,R2 \n"
" VMOV D13,R2,R2 \n"
" VMOV D14,R2,R2 \n"
" VMOV D15,R2,R2 \n"
#ifdef __ARM_ADVANCED_SIMD__
//Initialise D32 registers to 0
" VMOV D16,R2,R2 \n"
" VMOV D17,R2,R2 \n"
" VMOV D18,R2,R2 \n"
" VMOV D19,R2,R2 \n"
" VMOV D20,R2,R2 \n"
" VMOV D21,R2,R2 \n"
" VMOV D22,R2,R2 \n"
" VMOV D23,R2,R2 \n"
" VMOV D24,R2,R2 \n"
" VMOV D25,R2,R2 \n"
" VMOV D26,R2,R2 \n"
" VMOV D27,R2,R2 \n"
" VMOV D28,R2,R2 \n"
" VMOV D29,R2,R2 \n"
" VMOV D30,R2,R2 \n"
" VMOV D31,R2,R2 \n"
#endif
//Initialise FPSCR to a known state
" VMRS R1,FPSCR \n"
" MOV32 R2,#0x00086060 \n" //Mask off all bits that do not have to be preserved. Non-preserved bits can/should be zero.
" AND R1,R1,R2 \n"
" VMSR FPSCR,R1 \n"
: : : "cc", "r1", "r2"
);
}
#undef __IAR_FT
#undef __ICCARM_V8
#pragma diag_default=Pe940
#pragma diag_default=Pe177
#endif /* __CMSIS_ICCARM_H__ */

File diff suppressed because it is too large Load Diff

View File

@ -1,192 +0,0 @@
/**************************************************************************//**
* @file irq_ctrl.h
* @brief Interrupt Controller API header file
* @version V1.1.0
* @date 03. March 2020
******************************************************************************/
/*
* Copyright (c) 2017-2020 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#if defined ( __ICCARM__ )
#pragma system_include /* treat file as system include file for MISRA check */
#elif defined (__clang__)
#pragma clang system_header /* treat file as system include file */
#endif
#ifndef IRQ_CTRL_H_
#define IRQ_CTRL_H_
#include <stdint.h>
#ifndef IRQHANDLER_T
#define IRQHANDLER_T
/// Interrupt handler data type
typedef void (*IRQHandler_t) (void);
#endif
#ifndef IRQN_ID_T
#define IRQN_ID_T
/// Interrupt ID number data type
typedef int32_t IRQn_ID_t;
#endif
/* Interrupt mode bit-masks */
#define IRQ_MODE_TRIG_Pos (0U)
#define IRQ_MODE_TRIG_Msk (0x07UL /*<< IRQ_MODE_TRIG_Pos*/)
#define IRQ_MODE_TRIG_LEVEL (0x00UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: level triggered interrupt
#define IRQ_MODE_TRIG_LEVEL_LOW (0x01UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: low level triggered interrupt
#define IRQ_MODE_TRIG_LEVEL_HIGH (0x02UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: high level triggered interrupt
#define IRQ_MODE_TRIG_EDGE (0x04UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: edge triggered interrupt
#define IRQ_MODE_TRIG_EDGE_RISING (0x05UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: rising edge triggered interrupt
#define IRQ_MODE_TRIG_EDGE_FALLING (0x06UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: falling edge triggered interrupt
#define IRQ_MODE_TRIG_EDGE_BOTH (0x07UL /*<< IRQ_MODE_TRIG_Pos*/) ///< Trigger: rising and falling edge triggered interrupt
#define IRQ_MODE_TYPE_Pos (3U)
#define IRQ_MODE_TYPE_Msk (0x01UL << IRQ_MODE_TYPE_Pos)
#define IRQ_MODE_TYPE_IRQ (0x00UL << IRQ_MODE_TYPE_Pos) ///< Type: interrupt source triggers CPU IRQ line
#define IRQ_MODE_TYPE_FIQ (0x01UL << IRQ_MODE_TYPE_Pos) ///< Type: interrupt source triggers CPU FIQ line
#define IRQ_MODE_DOMAIN_Pos (4U)
#define IRQ_MODE_DOMAIN_Msk (0x01UL << IRQ_MODE_DOMAIN_Pos)
#define IRQ_MODE_DOMAIN_NONSECURE (0x00UL << IRQ_MODE_DOMAIN_Pos) ///< Domain: interrupt is targeting non-secure domain
#define IRQ_MODE_DOMAIN_SECURE (0x01UL << IRQ_MODE_DOMAIN_Pos) ///< Domain: interrupt is targeting secure domain
#define IRQ_MODE_CPU_Pos (5U)
#define IRQ_MODE_CPU_Msk (0xFFUL << IRQ_MODE_CPU_Pos)
#define IRQ_MODE_CPU_ALL (0x00UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets all CPUs
#define IRQ_MODE_CPU_0 (0x01UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 0
#define IRQ_MODE_CPU_1 (0x02UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 1
#define IRQ_MODE_CPU_2 (0x04UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 2
#define IRQ_MODE_CPU_3 (0x08UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 3
#define IRQ_MODE_CPU_4 (0x10UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 4
#define IRQ_MODE_CPU_5 (0x20UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 5
#define IRQ_MODE_CPU_6 (0x40UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 6
#define IRQ_MODE_CPU_7 (0x80UL << IRQ_MODE_CPU_Pos) ///< CPU: interrupt targets CPU 7
// Encoding in some early GIC implementations
#define IRQ_MODE_MODEL_Pos (13U)
#define IRQ_MODE_MODEL_Msk (0x1UL << IRQ_MODE_MODEL_Pos)
#define IRQ_MODE_MODEL_NN (0x0UL << IRQ_MODE_MODEL_Pos) ///< Corresponding interrupt is handled using the N-N model
#define IRQ_MODE_MODEL_1N (0x1UL << IRQ_MODE_MODEL_Pos) ///< Corresponding interrupt is handled using the 1-N model
#define IRQ_MODE_ERROR (0x80000000UL) ///< Bit indicating mode value error
/* Interrupt priority bit-masks */
#define IRQ_PRIORITY_Msk (0x0000FFFFUL) ///< Interrupt priority value bit-mask
#define IRQ_PRIORITY_ERROR (0x80000000UL) ///< Bit indicating priority value error
/// Initialize interrupt controller.
/// \return 0 on success, -1 on error.
int32_t IRQ_Initialize (void);
/// Register interrupt handler.
/// \param[in] irqn interrupt ID number
/// \param[in] handler interrupt handler function address
/// \return 0 on success, -1 on error.
int32_t IRQ_SetHandler (IRQn_ID_t irqn, IRQHandler_t handler);
/// Get the registered interrupt handler.
/// \param[in] irqn interrupt ID number
/// \return registered interrupt handler function address.
IRQHandler_t IRQ_GetHandler (IRQn_ID_t irqn);
/// Enable interrupt.
/// \param[in] irqn interrupt ID number
/// \return 0 on success, -1 on error.
int32_t IRQ_Enable (IRQn_ID_t irqn);
/// Disable interrupt.
/// \param[in] irqn interrupt ID number
/// \return 0 on success, -1 on error.
int32_t IRQ_Disable (IRQn_ID_t irqn);
/// Get interrupt enable state.
/// \param[in] irqn interrupt ID number
/// \return 0 - interrupt is disabled, 1 - interrupt is enabled.
uint32_t IRQ_GetEnableState (IRQn_ID_t irqn);
/// Configure interrupt request mode.
/// \param[in] irqn interrupt ID number
/// \param[in] mode mode configuration
/// \return 0 on success, -1 on error.
int32_t IRQ_SetMode (IRQn_ID_t irqn, uint32_t mode);
/// Get interrupt mode configuration.
/// \param[in] irqn interrupt ID number
/// \return current interrupt mode configuration with optional IRQ_MODE_ERROR bit set.
uint32_t IRQ_GetMode (IRQn_ID_t irqn);
/// Get ID number of current interrupt request (IRQ).
/// \return interrupt ID number.
IRQn_ID_t IRQ_GetActiveIRQ (void);
/// Get ID number of current fast interrupt request (FIQ).
/// \return interrupt ID number.
IRQn_ID_t IRQ_GetActiveFIQ (void);
/// Signal end of interrupt processing.
/// \param[in] irqn interrupt ID number
/// \return 0 on success, -1 on error.
int32_t IRQ_EndOfInterrupt (IRQn_ID_t irqn);
/// Set interrupt pending flag.
/// \param[in] irqn interrupt ID number
/// \return 0 on success, -1 on error.
int32_t IRQ_SetPending (IRQn_ID_t irqn);
/// Get interrupt pending flag.
/// \param[in] irqn interrupt ID number
/// \return 0 - interrupt is not pending, 1 - interrupt is pending.
uint32_t IRQ_GetPending (IRQn_ID_t irqn);
/// Clear interrupt pending flag.
/// \param[in] irqn interrupt ID number
/// \return 0 on success, -1 on error.
int32_t IRQ_ClearPending (IRQn_ID_t irqn);
/// Set interrupt priority value.
/// \param[in] irqn interrupt ID number
/// \param[in] priority interrupt priority value
/// \return 0 on success, -1 on error.
int32_t IRQ_SetPriority (IRQn_ID_t irqn, uint32_t priority);
/// Get interrupt priority.
/// \param[in] irqn interrupt ID number
/// \return current interrupt priority value with optional IRQ_PRIORITY_ERROR bit set.
uint32_t IRQ_GetPriority (IRQn_ID_t irqn);
/// Set priority masking threshold.
/// \param[in] priority priority masking threshold value
/// \return 0 on success, -1 on error.
int32_t IRQ_SetPriorityMask (uint32_t priority);
/// Get priority masking threshold
/// \return current priority masking threshold value with optional IRQ_PRIORITY_ERROR bit set.
uint32_t IRQ_GetPriorityMask (void);
/// Set priority grouping field split point
/// \param[in] bits number of MSB bits included in the group priority field comparison
/// \return 0 on success, -1 on error.
int32_t IRQ_SetPriorityGroupBits (uint32_t bits);
/// Get priority grouping field split point
/// \return current number of MSB bits included in the group priority field comparison with
/// optional IRQ_PRIORITY_ERROR bit set.
uint32_t IRQ_GetPriorityGroupBits (void);
#endif // IRQ_CTRL_H_

View File

@ -1,418 +0,0 @@
/**************************************************************************//**
* @file irq_ctrl_gic.c
* @brief Interrupt controller handling implementation for GIC
* @version V1.1.1
* @date 29. March 2021
******************************************************************************/
/*
* Copyright (c) 2017-2021 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stddef.h>
#include "RTE_Components.h"
#include CMSIS_device_header
#include "irq_ctrl.h"
#if defined(__GIC_PRESENT) && (__GIC_PRESENT == 1U)
/// Number of implemented interrupt lines
#ifndef IRQ_GIC_LINE_COUNT
#define IRQ_GIC_LINE_COUNT (1020U)
#endif
static IRQHandler_t IRQTable[IRQ_GIC_LINE_COUNT] = { 0U };
static uint32_t IRQ_ID0;
/// Initialize interrupt controller.
/// Resets every entry of the handler table to a null pointer and then enables the GIC.
/// \return always 0.
__WEAK int32_t IRQ_Initialize (void) {
  uint32_t line = IRQ_GIC_LINE_COUNT;

  // Walk the table from the last entry down to entry 0
  while (line != 0U) {
    line--;
    IRQTable[line] = (IRQHandler_t)NULL;
  }

  GIC_Enable();

  return (0);
}
/// Register interrupt handler.
/// \param[in] irqn    interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \param[in] handler interrupt handler function address stored in the handler table
/// \return 0 on success, -1 when irqn is out of range.
__WEAK int32_t IRQ_SetHandler (IRQn_ID_t irqn, IRQHandler_t handler) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  IRQTable[irqn] = handler;
  return (0);
}
/// Get the registered interrupt handler.
/// Only the low 10 bits of irqn take part in the lookup.
/// \param[in] irqn interrupt ID number
/// \return registered handler, or a null pointer when the masked ID lies outside the table.
__WEAK IRQHandler_t IRQ_GetHandler (IRQn_ID_t irqn) {
  // Ignore CPUID field (software generated interrupts)
  irqn &= 0x3FFU;

  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return ((IRQHandler_t)0);
  }

  return (IRQTable[irqn]);
}
/// Enable interrupt.
/// \param[in] irqn interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return 0 on success, -1 when irqn is out of range (the GIC is not touched then).
__WEAK int32_t IRQ_Enable (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_EnableIRQ ((IRQn_Type)irqn);
  return (0);
}
/// Disable interrupt.
/// \param[in] irqn interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return 0 on success, -1 when irqn is out of range (the GIC is not touched then).
__WEAK int32_t IRQ_Disable (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_DisableIRQ ((IRQn_Type)irqn);
  return (0);
}
/// Get interrupt enable state.
/// \param[in] irqn interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return value reported by GIC_GetEnableIRQ for a valid irqn, 0 when irqn is out of range.
__WEAK uint32_t IRQ_GetEnableState (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (0U);
  }

  return (GIC_GetEnableIRQ((IRQn_Type)irqn));
}
/// Configure interrupt request mode.
/// Decodes the trigger, model, type, domain and CPU-target fields of \a mode.
/// The GIC is only programmed when every field was accepted; a rejected field
/// leaves the interrupt configuration untouched.
/// \param[in] irqn  interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \param[in] mode  mode configuration composed of IRQ_MODE_* values
/// \return 0 on success, -1 when \a irqn is out of range or \a mode is not supported
__WEAK int32_t IRQ_SetMode (IRQn_ID_t irqn, uint32_t mode) {
  uint32_t val;
  uint8_t  cfg;
  uint8_t  secure;
  uint8_t  cpu;
  int32_t  status = 0;

  // An interrupt ID outside the valid range is an error, exactly as in the
  // other IRQ_* setters; nothing is written to the GIC in that case.
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  // Check triggering mode
  val = (mode & IRQ_MODE_TRIG_Msk);

  if (val == IRQ_MODE_TRIG_LEVEL) {
    cfg = 0x00U;
  } else if (val == IRQ_MODE_TRIG_EDGE) {
    cfg = 0x02U;
  } else {
    cfg = 0x00U;
    status = -1;
  }

  // Check handling model
  val = (mode & IRQ_MODE_MODEL_Msk);

  if (val == IRQ_MODE_MODEL_1N) {
    cfg |= 1U; // 1-N model
  }

  // Check interrupt type (only IRQ is accepted here)
  val = mode & IRQ_MODE_TYPE_Msk;

  if (val != IRQ_MODE_TYPE_IRQ) {
    status = -1;
  }

  // Check interrupt domain
  val = mode & IRQ_MODE_DOMAIN_Msk;

  if (val == IRQ_MODE_DOMAIN_NONSECURE) {
    secure = 0U;
  } else {
    // Check security extensions support (bit 10 of the distributor info word)
    val = GIC_DistributorInfo() & (1UL << 10U);

    if (val != 0U) {
      // Security extensions are supported
      secure = 1U;
    } else {
      secure = 0U;
      status = -1;
    }
  }

  // Check interrupt CPU targets
  val = mode & IRQ_MODE_CPU_Msk;

  if (val == IRQ_MODE_CPU_ALL) {
    cpu = 0xFFU;
  } else {
    cpu = (uint8_t)(val >> IRQ_MODE_CPU_Pos);
  }

  // Apply configuration if no mode error
  if (status == 0) {
    GIC_SetConfiguration((IRQn_Type)irqn, cfg);
    GIC_SetTarget       ((IRQn_Type)irqn, cpu);

    // The group is only written for the secure domain
    if (secure != 0U) {
      GIC_SetGroup ((IRQn_Type)irqn, secure);
    }
  }

  return (status);
}
/// Get interrupt mode configuration.
/// Rebuilds the mode word from the GIC configuration bits (bit 1: edge/level,
/// bit 0: 1-N model) and the CPU target value of the interrupt. The type field
/// is always reported as IRQ_MODE_TYPE_IRQ.
/// \param[in] irqn  interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return mode configuration, or IRQ_MODE_ERROR when \a irqn is out of range
__WEAK uint32_t IRQ_GetMode (IRQn_ID_t irqn) {
  uint32_t cfg;
  uint32_t result;

  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (IRQ_MODE_ERROR);
  }

  result = IRQ_MODE_TYPE_IRQ;

  // Get trigger mode
  cfg = GIC_GetConfiguration((IRQn_Type)irqn);

  if ((cfg & 2U) == 0U) {
    // Corresponding interrupt is level triggered
    result |= IRQ_MODE_TRIG_LEVEL;
  } else {
    // Corresponding interrupt is edge triggered
    result |= IRQ_MODE_TRIG_EDGE;
  }

  if ((cfg & 1U) != 0U) {
    result |= IRQ_MODE_MODEL_1N;
  }

  // Get interrupt CPU targets
  result |= GIC_GetTarget ((IRQn_Type)irqn) << IRQ_MODE_CPU_Pos;

  return (result);
}
/// Get ID number of current interrupt request (IRQ).
/// Acknowledges the pending interrupt at the GIC and returns the value read.
/// The sequence of GIC accesses below implements two GIC-390 errata workarounds
/// and must not be reordered.
/// \return value returned by GIC_AcknowledgePending(), passed through unchanged
__WEAK IRQn_ID_t IRQ_GetActiveIRQ (void) {
IRQn_ID_t irqn;
uint32_t prio;
/* Dummy read to avoid GIC 390 errata 801120 */
GIC_GetHighPendingIRQ();
irqn = GIC_AcknowledgePending();
/* Barrier placed between the acknowledge and the workaround accesses below */
__DSB();
/* Workaround GIC 390 errata 733075 (GIC-390_Errata_Notice_v6.pdf, 09-Jul-2014) */
/* The following workaround code is for a single-core system. It would be */
/* different in a multi-core system. */
/* If the ID is 0 or 0x3FE or 0x3FF, then the GIC CPU interface may be locked-up */
/* so unlock it, otherwise service the interrupt as normal. */
/* Special IDs 1020=0x3FC and 1021=0x3FD are reserved values in GICv1 and GICv2 */
/* so will not occur here. */
if ((irqn == 0) || (irqn >= 0x3FE)) {
/* Unlock the CPU interface with a dummy write to Interrupt Priority Register */
/* (the priority of ID 0 is read and written back unchanged) */
prio = GIC_GetPriority((IRQn_Type)0);
GIC_SetPriority ((IRQn_Type)0, prio);
__DSB();
/* NOTE(review): within this file IRQ_ID0 is only tested here and reset in */
/* IRQ_EndOfInterrupt; bit 0 of GIC_GetIRQStatus() is the condition checked. */
if ((irqn == 0U) && ((GIC_GetIRQStatus ((IRQn_Type)irqn) & 1U) != 0U) && (IRQ_ID0 == 0U)) {
/* If the ID is 0, is active and has not been seen before */
IRQ_ID0 = 1U;
}
/* End of Workaround GIC 390 errata 733075 */
}
return (irqn);
}
/// Get ID number of current fast interrupt request (FIQ).
/// This implementation never reports an FIQ.
/// \return always -1
__WEAK IRQn_ID_t IRQ_GetActiveFIQ (void) {
  const IRQn_ID_t none = (IRQn_ID_t)-1;

  return (none);
}
/// Signal end of interrupt processing.
/// The range check uses only the low 10 bits of \a irqn, while the value
/// handed to GIC_EndInterrupt is the unmasked one received from the caller.
/// Ending interrupt ID 0 also clears the IRQ_ID0 flag.
/// \param[in] irqn  interrupt ID number, optionally with upper bits set
/// \return 0 when the end of interrupt was signalled, -1 when the masked ID is out of range
__WEAK int32_t IRQ_EndOfInterrupt (IRQn_ID_t irqn) {
  // Keep the value as received: this is what is written back to the GIC
  const IRQn_Type raw = (IRQn_Type)irqn;

  irqn &= 0x3FFU;

  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_EndInterrupt (raw);

  if (irqn == 0) {
    IRQ_ID0 = 0U;
  }

  return (0);
}
/// Set interrupt pending flag.
/// \param[in] irqn  interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return 0 when the pending flag was set, -1 when \a irqn is out of range
__WEAK int32_t IRQ_SetPending (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_SetPendingIRQ ((IRQn_Type)irqn);

  return (0);
}
/// Get interrupt pending flag.
/// IDs below 16 are not queried and always report 0.
/// \param[in] irqn  interrupt ID number, valid range 16 .. IRQ_GIC_LINE_COUNT-1
/// \return bit 0 of the GIC pending state, or 0 when \a irqn is out of range
__WEAK uint32_t IRQ_GetPending (IRQn_ID_t irqn) {
  if ((irqn < 16) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (0U);
  }

  return (GIC_GetPendingIRQ ((IRQn_Type)irqn) & 1U);
}
/// Clear interrupt pending flag.
/// IDs below 16 are rejected.
/// \param[in] irqn  interrupt ID number, valid range 16 .. IRQ_GIC_LINE_COUNT-1
/// \return 0 when the pending flag was cleared, -1 when \a irqn is out of range
__WEAK int32_t IRQ_ClearPending (IRQn_ID_t irqn) {
  if ((irqn < 16) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_ClearPendingIRQ ((IRQn_Type)irqn);

  return (0);
}
/// Set interrupt priority value.
/// \param[in] irqn      interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \param[in] priority  priority value passed on to GIC_SetPriority
/// \return 0 when the priority was written, -1 when \a irqn is out of range
__WEAK int32_t IRQ_SetPriority (IRQn_ID_t irqn, uint32_t priority) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (-1);
  }

  GIC_SetPriority ((IRQn_Type)irqn, priority);

  return (0);
}
/// Get interrupt priority.
/// \param[in] irqn  interrupt ID number, valid range 0 .. IRQ_GIC_LINE_COUNT-1
/// \return priority read from the GIC, or IRQ_PRIORITY_ERROR when \a irqn is out of range
__WEAK uint32_t IRQ_GetPriority (IRQn_ID_t irqn) {
  if ((irqn < 0) || (irqn >= (IRQn_ID_t)IRQ_GIC_LINE_COUNT)) {
    return (IRQ_PRIORITY_ERROR);
  }

  return (GIC_GetPriority ((IRQn_Type)irqn));
}
/// Set priority masking threshold.
/// \param[in] priority  threshold passed unchecked to GIC_SetInterfacePriorityMask
/// \return always 0
__WEAK int32_t IRQ_SetPriorityMask (uint32_t priority) {
  const int32_t status = 0;

  GIC_SetInterfacePriorityMask (priority);

  return (status);
}
/// Get priority masking threshold.
/// \return value reported by GIC_GetInterfacePriorityMask
__WEAK uint32_t IRQ_GetPriorityMask (void) {
  const uint32_t mask = GIC_GetInterfacePriorityMask();

  return (mask);
}
/// Set priority grouping field split point.
/// The special value IRQ_PRIORITY_Msk is treated as 7. The GIC binary point
/// is programmed with (7 - bits).
/// \param[in] bits  number of group bits (0 .. 7) or IRQ_PRIORITY_Msk
/// \return 0 when the binary point was written, -1 when \a bits is not supported
__WEAK int32_t IRQ_SetPriorityGroupBits (uint32_t bits) {
  const uint32_t split = (bits == IRQ_PRIORITY_Msk) ? 7U : bits;

  if (split >= 8U) {
    return (-1);
  }

  GIC_SetBinaryPoint (7U - split);

  return (0);
}
/// Get priority grouping field split point.
/// Inverse of IRQ_SetPriorityGroupBits: only the low three bits of the GIC
/// binary point are used.
/// \return 7 minus the masked binary point value
__WEAK uint32_t IRQ_GetPriorityGroupBits (void) {
  const uint32_t point = GIC_GetBinaryPoint() & 0x07U;

  return (7U - point);
}
#endif

View File

@ -1,561 +0,0 @@
/*
* Copyright (c) 2013-2021 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 16. June 2021
* $Revision: V2.1.0
*
* Project: CMSIS-DAP Configuration
* Title: DAP_config.h CMSIS-DAP Configuration File (Template)
*
*---------------------------------------------------------------------------*/
#ifndef __DAP_CONFIG_H__
#define __DAP_CONFIG_H__
//**************************************************************************************************
/**
\defgroup DAP_Config_Debug_gr CMSIS-DAP Debug Unit Information
\ingroup DAP_ConfigIO_gr
@{
Provides definitions about the hardware and configuration of the Debug Unit.
This information includes:
- Definition of Cortex-M processor parameters used in CMSIS-DAP Debug Unit.
- Debug Unit Identification strings (Vendor, Product, Serial Number).
- Debug Unit communication packet size.
- Debug Access Port supported modes and settings (JTAG/SWD and SWO).
- Optional information about a connected Target Device (for Evaluation Boards).
*/
#ifdef _RTE_
#include "RTE_Components.h"
#include CMSIS_device_header
#else
#include "device.h" // Debug Unit Cortex-M Processor Header File
#endif
/// Processor Clock of the Cortex-M MCU used in the Debug Unit.
/// This value is used to calculate the SWD/JTAG clock speed.
/// The template value is 100 MHz, the same value as TIMESTAMP_CLOCK below.
#define CPU_CLOCK 100000000U ///< Specifies the CPU Clock in Hz.
/// Number of processor cycles for I/O Port write operations.
/// This value is used to calculate the SWD/JTAG clock speed that is generated with I/O
/// Port write operations in the Debug Unit by a Cortex-M MCU. Most Cortex-M processors
/// require 2 processor cycles for an I/O Port write operation. If the Debug Unit uses
/// a Cortex-M0+ processor with high-speed peripheral I/O, only 1 processor cycle might be
/// required.
#define IO_PORT_WRITE_CYCLES 2U ///< I/O Cycles: 2=default, 1=Cortex-M0+ fast I/O.
/// Indicate that Serial Wire Debug (SWD) communication mode is available at the Debug Access Port.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
/// DAP.h also tests this macro to decide whether DAP_Data_t contains the SWD configuration.
#define DAP_SWD 1 ///< SWD Mode: 1 = available, 0 = not available.
/// Indicate that JTAG communication mode is available at the Debug Port.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
/// DAP.h also tests this macro to decide whether DAP_Data_t contains the JTAG device chain.
#define DAP_JTAG 1 ///< JTAG Mode: 1 = available, 0 = not available.
/// Configure maximum number of JTAG devices on the scan chain connected to the Debug Access Port.
/// This setting impacts the RAM requirements of the Debug Unit. Valid range is 1 .. 255.
/// (It is the length of the per-device arrays in DAP_Data_t.)
#define DAP_JTAG_DEV_CNT 8U ///< Maximum number of JTAG devices on scan chain.
/// Default communication mode on the Debug Access Port.
/// Used for the command \ref DAP_Connect when Port Default mode is selected.
#define DAP_DEFAULT_PORT 1U ///< Default JTAG/SWJ Port Mode: 1 = SWD, 2 = JTAG.
/// Default communication speed on the Debug Access Port for SWD and JTAG mode.
/// Used to initialize the default SWD/JTAG clock frequency.
/// The command \ref DAP_SWJ_Clock can be used to overwrite this default setting.
#define DAP_DEFAULT_SWJ_CLOCK 1000000U ///< Default SWD/JTAG clock frequency in Hz.
/// Maximum Packet Size for Command and Response data.
/// This configuration setting is used to optimize the communication performance with the
/// debugger and depends on the USB peripheral. Typical values are 64 for Full-speed USB HID or WinUSB,
/// 1024 for High-speed USB HID and 512 for High-speed USB WinUSB.
#define DAP_PACKET_SIZE 512U ///< Specifies Packet Size in bytes.
/// Maximum number of Packet Buffers for Command and Response data.
/// This configuration setting is used to optimize the communication performance with the
/// debugger and depends on the USB peripheral. For devices with limited RAM or USB buffer the
/// setting can be reduced (valid range is 1 .. 255).
#define DAP_PACKET_COUNT 8U ///< Specifies number of packets buffered.
/// Indicate that UART Serial Wire Output (SWO) trace is available.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
#define SWO_UART 1 ///< SWO UART: 1 = available, 0 = not available.
/// USART Driver instance number for the UART SWO.
/// The template uses instance 0 here and instance 1 for DAP_UART_DRIVER.
#define SWO_UART_DRIVER 0 ///< USART Driver instance number (Driver_USART#).
/// Maximum SWO UART Baudrate.
#define SWO_UART_MAX_BAUDRATE 10000000U ///< SWO UART Maximum Baudrate in Hz.
/// Indicate that Manchester Serial Wire Output (SWO) trace is available.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
#define SWO_MANCHESTER 0 ///< SWO Manchester: 1 = available, 0 = not available.
/// SWO Trace Buffer Size.
#define SWO_BUFFER_SIZE 4096U ///< SWO Trace Buffer Size in bytes (must be 2^n).
/// SWO Streaming Trace.
#define SWO_STREAM 0 ///< SWO Streaming Trace: 1 = available, 0 = not available.
/// Clock frequency of the Test Domain Timer. Timer value is returned with \ref TIMESTAMP_GET.
/// The template value equals CPU_CLOCK; TIMESTAMP_GET in this file reads DWT->CYCCNT.
#define TIMESTAMP_CLOCK 100000000U ///< Timestamp clock in Hz (0 = timestamps not supported).
/// Indicate that UART Communication Port is available.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
#define DAP_UART 1 ///< DAP UART: 1 = available, 0 = not available.
/// USART Driver instance number for the UART Communication Port.
/// The template uses instance 1 here and instance 0 for SWO_UART_DRIVER.
#define DAP_UART_DRIVER 1 ///< USART Driver instance number (Driver_USART#).
/// UART Receive Buffer Size.
#define DAP_UART_RX_BUFFER_SIZE 1024U ///< UART Receive Buffer Size in bytes (must be 2^n).
/// UART Transmit Buffer Size.
#define DAP_UART_TX_BUFFER_SIZE 1024U ///< UART Transmit Buffer Size in bytes (must be 2^n).
/// Indicate that UART Communication via USB COM Port is available.
/// This information is returned by the command \ref DAP_Info as part of <b>Capabilities</b>.
#define DAP_UART_USB_COM_PORT 1 ///< USB COM Port: 1 = available, 0 = not available.
/// Debug Unit is connected to fixed Target Device.
/// The Debug Unit may be part of an evaluation board and always connected to a fixed
/// known device. In this case the Device Vendor, Device Name, Board Vendor and Board Name strings
/// are stored and may be used by the debugger or IDE to configure device parameters.
/// With TARGET_FIXED set to 0 the four strings below are not compiled into the image
/// and the DAP_GetTarget*String functions return 0.
#define TARGET_FIXED 0 ///< Target: 1 = known, 0 = unknown;
#define TARGET_DEVICE_VENDOR "Arm" ///< String indicating the Silicon Vendor
#define TARGET_DEVICE_NAME "Cortex-M" ///< String indicating the Target Device
#define TARGET_BOARD_VENDOR "Arm" ///< String indicating the Board Vendor
#define TARGET_BOARD_NAME "Arm board" ///< String indicating the Board Name
// String storage read by the DAP_GetTarget*String functions; <string.h> provides the
// strcpy/strlen calls those functions make. Only compiled for a fixed target.
#if TARGET_FIXED != 0
#include <string.h>
static const char TargetDeviceVendor [] = TARGET_DEVICE_VENDOR;
static const char TargetDeviceName [] = TARGET_DEVICE_NAME;
static const char TargetBoardVendor [] = TARGET_BOARD_VENDOR;
static const char TargetBoardName [] = TARGET_BOARD_NAME;
#endif
/** Get Vendor Name string.
Template implementation: provides no string and leaves the buffer untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string);
        this template always returns 0.
*/
__STATIC_INLINE uint8_t DAP_GetVendorString (char *str) {
  const uint8_t length = 0U;

  (void)str;   /* buffer is not written */

  return (length);
}
/** Get Product Name string.
Template implementation: provides no string and leaves the buffer untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string);
        this template always returns 0.
*/
__STATIC_INLINE uint8_t DAP_GetProductString (char *str) {
  const uint8_t length = 0U;

  (void)str;   /* buffer is not written */

  return (length);
}
/** Get Serial Number string.
Template implementation: provides no string and leaves the buffer untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string);
        this template always returns 0.
*/
__STATIC_INLINE uint8_t DAP_GetSerNumString (char *str) {
  const uint8_t length = 0U;

  (void)str;   /* buffer is not written */

  return (length);
}
/** Get Target Device Vendor string.
When TARGET_FIXED is non-zero, copies TargetDeviceVendor (including its
terminator) into the buffer; otherwise the buffer is left untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string).
*/
__STATIC_INLINE uint8_t DAP_GetTargetDeviceVendorString (char *str) {
#if TARGET_FIXED != 0
  const size_t total = strlen(TargetDeviceVendor) + 1U;   /* characters + terminator */

  (void)strcpy(str, TargetDeviceVendor);

  return ((uint8_t)total);
#else
  (void)str;

  return (0U);
#endif
}
/** Get Target Device Name string.
When TARGET_FIXED is non-zero, copies TargetDeviceName (including its
terminator) into the buffer; otherwise the buffer is left untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string).
*/
__STATIC_INLINE uint8_t DAP_GetTargetDeviceNameString (char *str) {
#if TARGET_FIXED != 0
  const size_t total = strlen(TargetDeviceName) + 1U;   /* characters + terminator */

  (void)strcpy(str, TargetDeviceName);

  return ((uint8_t)total);
#else
  (void)str;

  return (0U);
#endif
}
/** Get Target Board Vendor string.
When TARGET_FIXED is non-zero, copies TargetBoardVendor (including its
terminator) into the buffer; otherwise the buffer is left untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string).
*/
__STATIC_INLINE uint8_t DAP_GetTargetBoardVendorString (char *str) {
#if TARGET_FIXED != 0
  const size_t total = strlen(TargetBoardVendor) + 1U;   /* characters + terminator */

  (void)strcpy(str, TargetBoardVendor);

  return ((uint8_t)total);
#else
  (void)str;

  return (0U);
#endif
}
/** Get Target Board Name string.
When TARGET_FIXED is non-zero, copies TargetBoardName (including its
terminator) into the buffer; otherwise the buffer is left untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string).
*/
__STATIC_INLINE uint8_t DAP_GetTargetBoardNameString (char *str) {
#if TARGET_FIXED != 0
  const size_t total = strlen(TargetBoardName) + 1U;   /* characters + terminator */

  (void)strcpy(str, TargetBoardName);

  return ((uint8_t)total);
#else
  (void)str;

  return (0U);
#endif
}
/** Get Product Firmware Version string.
Template implementation: provides no string and leaves the buffer untouched.
\param str Pointer to buffer to store the string (max 60 characters).
\return String length (including terminating NULL character) or 0 (no string);
        this template always returns 0.
*/
__STATIC_INLINE uint8_t DAP_GetProductFirmwareVersionString (char *str) {
  const uint8_t length = 0U;

  (void)str;   /* buffer is not written */

  return (length);
}
///@}
//**************************************************************************************************
/**
\defgroup DAP_Config_PortIO_gr CMSIS-DAP Hardware I/O Pin Access
\ingroup DAP_ConfigIO_gr
@{
Standard I/O Pins of the CMSIS-DAP Hardware Debug Port support standard JTAG mode
and Serial Wire Debug (SWD) mode. In SWD mode only 2 pins are required to implement the debug
interface of a device. The following I/O Pins are provided:
JTAG I/O Pin | SWD I/O Pin | CMSIS-DAP Hardware pin mode
---------------------------- | -------------------- | ---------------------------------------------
TCK: Test Clock | SWCLK: Clock | Output Push/Pull
TMS: Test Mode Select | SWDIO: Data I/O | Output Push/Pull; Input (for receiving data)
TDI: Test Data Input | | Output Push/Pull
TDO: Test Data Output | | Input
nTRST: Test Reset (optional) | | Output Open Drain with pull-up resistor
nRESET: Device Reset | nRESET: Device Reset | Output Open Drain with pull-up resistor
DAP Hardware I/O Pin Access Functions
-------------------------------------
The various I/O Pins are accessed by functions that implement the Read, Write, Set, or Clear to
these I/O Pins.
For the SWDIO I/O Pin there are additional functions that are called in SWD I/O mode only.
These functions are provided to achieve faster I/O that is possible with some advanced GPIO
peripherals that can independently write/read a single I/O pin without affecting any other pins
of the same I/O port. The following SWDIO I/O Pin functions are provided:
- \ref PIN_SWDIO_OUT_ENABLE to enable the output mode from the DAP hardware.
- \ref PIN_SWDIO_OUT_DISABLE to enable the input mode to the DAP hardware.
- \ref PIN_SWDIO_IN to read from the SWDIO I/O pin with utmost possible speed.
- \ref PIN_SWDIO_OUT to write to the SWDIO I/O pin with utmost possible speed.
*/
// Configure DAP I/O pins ------------------------------
/** Setup JTAG I/O pins: TCK, TMS, TDI, TDO, nTRST, and nRESET.
A port of this template is expected to configure the DAP Hardware I/O pins for JTAG mode:
- TCK, TMS, TDI, nTRST, nRESET to output mode and set to high level.
- TDO to input mode.

The template body is empty and touches no hardware.
*/
__STATIC_INLINE void PORT_JTAG_SETUP (void) {
  /* Template stub: add the JTAG pin configuration here. */
}
/** Setup SWD I/O pins: SWCLK, SWDIO, and nRESET.
A port of this template is expected to configure the DAP Hardware I/O pins for
Serial Wire Debug (SWD) mode:
- SWCLK, SWDIO, nRESET to output mode and set to default high level.
- TDI, nTRST to HighZ mode (pins are unused in SWD mode).

The template body is empty and touches no hardware.
*/
__STATIC_INLINE void PORT_SWD_SETUP (void) {
  /* Template stub: add the SWD pin configuration here. */
}
/** Disable JTAG/SWD I/O Pins.
A port of this template is expected to switch the DAP Hardware I/O pins off:
- TCK/SWCLK, TMS/SWDIO, TDI, TDO, nTRST, nRESET to High-Z mode.

The template body is empty and touches no hardware.
*/
__STATIC_INLINE void PORT_OFF (void) {
  /* Template stub: release all debug pins here. */
}
// SWCLK/TCK I/O pin -------------------------------------
/** SWCLK/TCK I/O pin: Get Input.
\return Current status of the SWCLK/TCK DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_SWCLK_TCK_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** SWCLK/TCK I/O pin: Set Output to High.
A port is expected to drive the SWCLK/TCK DAP hardware I/O pin to high level.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWCLK_TCK_SET (void) {
  /* Template stub: no pin is driven. */
}
/** SWCLK/TCK I/O pin: Set Output to Low.
A port is expected to drive the SWCLK/TCK DAP hardware I/O pin to low level.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWCLK_TCK_CLR (void) {
  /* Template stub: no pin is driven. */
}
// SWDIO/TMS Pin I/O --------------------------------------
/** SWDIO/TMS I/O pin: Get Input.
\return Current status of the SWDIO/TMS DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_SWDIO_TMS_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** SWDIO/TMS I/O pin: Set Output to High.
A port is expected to drive the SWDIO/TMS DAP hardware I/O pin to high level.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWDIO_TMS_SET (void) {
  /* Template stub: no pin is driven. */
}
/** SWDIO/TMS I/O pin: Set Output to Low.
A port is expected to drive the SWDIO/TMS DAP hardware I/O pin to low level.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWDIO_TMS_CLR (void) {
  /* Template stub: no pin is driven. */
}
/** SWDIO I/O pin: Get Input (used in SWD mode only).
\return Current status of the SWDIO DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_SWDIO_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** SWDIO I/O pin: Set Output (used in SWD mode only).
The template body drives no pin.
\param bit Output value for the SWDIO DAP hardware I/O pin (ignored by the template).
*/
__STATIC_FORCEINLINE void PIN_SWDIO_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the pin */
}
/** SWDIO I/O pin: Switch to Output mode (used in SWD mode only).
A port is expected to configure the SWDIO DAP hardware I/O pin to output mode.
This function is called prior to \ref PIN_SWDIO_OUT function calls.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWDIO_OUT_ENABLE (void) {
  /* Template stub: pin direction is not changed. */
}
/** SWDIO I/O pin: Switch to Input mode (used in SWD mode only).
A port is expected to configure the SWDIO DAP hardware I/O pin to input mode.
This function is called prior to \ref PIN_SWDIO_IN function calls.
The template body is empty.
*/
__STATIC_FORCEINLINE void PIN_SWDIO_OUT_DISABLE (void) {
  /* Template stub: pin direction is not changed. */
}
// TDI Pin I/O ---------------------------------------------
/** TDI I/O pin: Get Input.
\return Current status of the TDI DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_TDI_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** TDI I/O pin: Set Output.
The template body drives no pin.
\param bit Output value for the TDI DAP hardware I/O pin (ignored by the template).
*/
__STATIC_FORCEINLINE void PIN_TDI_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the pin */
}
// TDO Pin I/O ---------------------------------------------
/** TDO I/O pin: Get Input.
\return Current status of the TDO DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_TDO_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
// nTRST Pin I/O -------------------------------------------
/** nTRST I/O pin: Get Input.
\return Current status of the nTRST DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_nTRST_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** nTRST I/O pin: Set Output.
The template body drives no pin.
\param bit JTAG TRST Test Reset pin status (ignored by the template):
- 0: issue a JTAG TRST Test Reset.
- 1: release JTAG TRST Test Reset.
*/
__STATIC_FORCEINLINE void PIN_nTRST_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the pin */
}
// nRESET Pin I/O------------------------------------------
/** nRESET I/O pin: Get Input.
\return Current status of the nRESET DAP hardware I/O pin;
        the template stub reads no pin and always returns 0.
*/
__STATIC_FORCEINLINE uint32_t PIN_nRESET_IN (void) {
  const uint32_t level = 0U;   /* no pin is read in the template */

  return (level);
}
/** nRESET I/O pin: Set Output.
The template body drives no pin.
\param bit target device hardware reset pin status (ignored by the template):
- 0: issue a device hardware reset.
- 1: release device hardware reset.
*/
__STATIC_FORCEINLINE void PIN_nRESET_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the pin */
}
///@}
//**************************************************************************************************
/**
\defgroup DAP_Config_LEDs_gr CMSIS-DAP Hardware Status LEDs
\ingroup DAP_ConfigIO_gr
@{
CMSIS-DAP Hardware may provide LEDs that indicate the status of the CMSIS-DAP Debug Unit.
It is recommended to provide the following LEDs for status indication:
- Connect LED: is active when the DAP hardware is connected to a debugger.
- Running LED: is active when the debugger has put the target device into running state.
*/
/** Debug Unit: Set status of Connected LED.
The template body drives no LED.
\param bit status of the Connect LED (ignored by the template).
- 1: Connect LED ON: debugger is connected to CMSIS-DAP Debug Unit.
- 0: Connect LED OFF: debugger is not connected to CMSIS-DAP Debug Unit.
*/
__STATIC_INLINE void LED_CONNECTED_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the LED */
}
/** Debug Unit: Set status of Target Running LED.
The template body drives no LED.
\param bit status of the Target Running LED (ignored by the template).
- 1: Target Running LED ON: program execution in target started.
- 0: Target Running LED OFF: program execution in target stopped.
*/
__STATIC_INLINE void LED_RUNNING_OUT (uint32_t bit) {
  (void)bit;   /* unused until a port drives the LED */
}
///@}
//**************************************************************************************************
/**
\defgroup DAP_Config_Timestamp_gr CMSIS-DAP Timestamp
\ingroup DAP_ConfigIO_gr
@{
Access function for Test Domain Timer.
The value of the Test Domain Timer in the Debug Unit is returned by the function \ref TIMESTAMP_GET. By
default, the DWT timer is used. The frequency of this timer is configured with \ref TIMESTAMP_CLOCK.
*/
/** Get timestamp of Test Domain Timer.
Performs a single read of the DWT cycle counter register.
\return Current timestamp value (DWT->CYCCNT).
*/
__STATIC_INLINE uint32_t TIMESTAMP_GET (void) {
  const uint32_t now = DWT->CYCCNT;

  return (now);
}
///@}
//**************************************************************************************************
/**
\defgroup DAP_Config_Initialization_gr CMSIS-DAP Initialization
\ingroup DAP_ConfigIO_gr
@{
CMSIS-DAP Hardware I/O and LED Pins are initialized with the function \ref DAP_SETUP.
*/
/** Setup of the Debug Unit I/O pins and LEDs (called when Debug Unit is initialized).
A port of this template is expected to initialize the CMSIS-DAP Hardware I/O Pins
and the Status LEDs. In detail:
- I/O clock system enabled.
- all I/O pins: input buffer enabled, output pins are set to HighZ mode.
- for nTRST, nRESET a weak pull-up (if available) is enabled.
- LED output pins are enabled and LEDs are turned off.

The template body is empty and touches no hardware.
*/
__STATIC_INLINE void DAP_SETUP (void) {
  /* Template stub: add clock, pin and LED initialization here. */
}
/** Reset Target Device with custom specific I/O pin or command sequence.
This function allows the optional implementation of a device specific reset sequence.
It is called when the command \ref DAP_ResetTarget is received and is for example
required when a device needs a time-critical unlock sequence that enables the debug port.
\return 0 = no device specific reset sequence is implemented.\n
        1 = a device specific reset sequence is implemented.\n
        The template implements none and always returns 0.
*/
__STATIC_INLINE uint8_t RESET_TARGET (void) {
  const uint8_t implemented = 0U;   // change to '1' when a device reset sequence is implemented

  return (implemented);
}
///@}
#endif /* __DAP_CONFIG_H__ */

View File

@ -1,367 +0,0 @@
/*
* Copyright (c) 2013-2022 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 26. April 2022
* $Revision: V2.1.1
*
* Project: CMSIS-DAP Include
* Title: DAP.h Definitions
*
*---------------------------------------------------------------------------*/
#ifndef __DAP_H__
#define __DAP_H__
// DAP Firmware Version
// Version string selected at compile time: "1.3.0" when DAP_FW_V1 is defined,
// "2.1.1" otherwise.
#ifdef DAP_FW_V1
#define DAP_FW_VER "1.3.0"
#else
#define DAP_FW_VER "2.1.1"
#endif
// DAP Command IDs
// Standard commands use 0x00..0x0A, 0x10..0x23 and 0x7E/0x7F. The list is grouped
// by function, so the values are not strictly ascending (e.g. SWD_Sequence is 0x1D,
// SWO_ExtendedStatus is 0x1E and UART_Transfer is 0x21).
#define ID_DAP_Info 0x00U
#define ID_DAP_HostStatus 0x01U
#define ID_DAP_Connect 0x02U
#define ID_DAP_Disconnect 0x03U
#define ID_DAP_TransferConfigure 0x04U
#define ID_DAP_Transfer 0x05U
#define ID_DAP_TransferBlock 0x06U
#define ID_DAP_TransferAbort 0x07U
#define ID_DAP_WriteABORT 0x08U
#define ID_DAP_Delay 0x09U
#define ID_DAP_ResetTarget 0x0AU
#define ID_DAP_SWJ_Pins 0x10U
#define ID_DAP_SWJ_Clock 0x11U
#define ID_DAP_SWJ_Sequence 0x12U
#define ID_DAP_SWD_Configure 0x13U
#define ID_DAP_SWD_Sequence 0x1DU
#define ID_DAP_JTAG_Sequence 0x14U
#define ID_DAP_JTAG_Configure 0x15U
#define ID_DAP_JTAG_IDCODE 0x16U
#define ID_DAP_SWO_Transport 0x17U
#define ID_DAP_SWO_Mode 0x18U
#define ID_DAP_SWO_Baudrate 0x19U
#define ID_DAP_SWO_Control 0x1AU
#define ID_DAP_SWO_Status 0x1BU
#define ID_DAP_SWO_ExtendedStatus 0x1EU
#define ID_DAP_SWO_Data 0x1CU
#define ID_DAP_UART_Transport 0x1FU
#define ID_DAP_UART_Configure 0x20U
#define ID_DAP_UART_Control 0x22U
#define ID_DAP_UART_Status 0x23U
#define ID_DAP_UART_Transfer 0x21U
#define ID_DAP_QueueCommands 0x7EU
#define ID_DAP_ExecuteCommands 0x7FU
// DAP Vendor Command IDs
// 32 consecutive IDs, 0x80 (Vendor0) through 0x9F (Vendor31).
#define ID_DAP_Vendor0 0x80U
#define ID_DAP_Vendor1 0x81U
#define ID_DAP_Vendor2 0x82U
#define ID_DAP_Vendor3 0x83U
#define ID_DAP_Vendor4 0x84U
#define ID_DAP_Vendor5 0x85U
#define ID_DAP_Vendor6 0x86U
#define ID_DAP_Vendor7 0x87U
#define ID_DAP_Vendor8 0x88U
#define ID_DAP_Vendor9 0x89U
#define ID_DAP_Vendor10 0x8AU
#define ID_DAP_Vendor11 0x8BU
#define ID_DAP_Vendor12 0x8CU
#define ID_DAP_Vendor13 0x8DU
#define ID_DAP_Vendor14 0x8EU
#define ID_DAP_Vendor15 0x8FU
#define ID_DAP_Vendor16 0x90U
#define ID_DAP_Vendor17 0x91U
#define ID_DAP_Vendor18 0x92U
#define ID_DAP_Vendor19 0x93U
#define ID_DAP_Vendor20 0x94U
#define ID_DAP_Vendor21 0x95U
#define ID_DAP_Vendor22 0x96U
#define ID_DAP_Vendor23 0x97U
#define ID_DAP_Vendor24 0x98U
#define ID_DAP_Vendor25 0x99U
#define ID_DAP_Vendor26 0x9AU
#define ID_DAP_Vendor27 0x9BU
#define ID_DAP_Vendor28 0x9CU
#define ID_DAP_Vendor29 0x9DU
#define ID_DAP_Vendor30 0x9EU
#define ID_DAP_Vendor31 0x9FU
// Marker value for an invalid command ID
#define ID_DAP_Invalid 0xFFU
// DAP Status Code
#define DAP_OK 0U
#define DAP_ERROR 0xFFU
// DAP ID
// Selector values for the individual information items: 1..9 are strings
// (see the DAP_Get*String functions in DAP_config.h), 0xF0 and above are
// capability and size values.
#define DAP_ID_VENDOR 1U
#define DAP_ID_PRODUCT 2U
#define DAP_ID_SER_NUM 3U
#define DAP_ID_DAP_FW_VER 4U
#define DAP_ID_DEVICE_VENDOR 5U
#define DAP_ID_DEVICE_NAME 6U
#define DAP_ID_BOARD_VENDOR 7U
#define DAP_ID_BOARD_NAME 8U
#define DAP_ID_PRODUCT_FW_VER 9U
#define DAP_ID_CAPABILITIES 0xF0U
#define DAP_ID_TIMESTAMP_CLOCK 0xF1U
#define DAP_ID_UART_RX_BUFFER_SIZE 0xFBU
#define DAP_ID_UART_TX_BUFFER_SIZE 0xFCU
#define DAP_ID_SWO_BUFFER_SIZE 0xFDU
#define DAP_ID_PACKET_COUNT 0xFEU
#define DAP_ID_PACKET_SIZE 0xFFU
// DAP Host Status
#define DAP_DEBUGGER_CONNECTED 0U
#define DAP_TARGET_RUNNING 1U
// DAP Port
// NOTE(review): DAP_PORT_AUTODETECT and DAP_PORT_DISABLED share the value 0U, so the
// two cannot be told apart by value; presumably one is a request and the other a
// resulting state - confirm against the users of these macros.
#define DAP_PORT_AUTODETECT 0U // Autodetect Port
#define DAP_PORT_DISABLED 0U // Port Disabled (I/O pins in High-Z)
#define DAP_PORT_SWD 1U // SWD Port (SWCLK, SWDIO) + nRESET
#define DAP_PORT_JTAG 2U // JTAG Port (TCK, TMS, TDI, TDO, nTRST) + nRESET
// DAP SWJ Pins
// Plain indices (no U suffix, values 4 and 6 unused); presumably bit positions
// within the SWJ pin byte - TODO confirm against the DAP_SWJ_Pins implementation.
#define DAP_SWJ_SWCLK_TCK 0 // SWCLK/TCK
#define DAP_SWJ_SWDIO_TMS 1 // SWDIO/TMS
#define DAP_SWJ_TDI 2 // TDI
#define DAP_SWJ_TDO 3 // TDO
#define DAP_SWJ_nTRST 5 // nTRST
#define DAP_SWJ_nRESET 7 // nRESET
// DAP Transfer Request
// Single-bit masks; bit 6 is not assigned.
#define DAP_TRANSFER_APnDP (1U<<0)
#define DAP_TRANSFER_RnW (1U<<1)
#define DAP_TRANSFER_A2 (1U<<2)
#define DAP_TRANSFER_A3 (1U<<3)
#define DAP_TRANSFER_MATCH_VALUE (1U<<4)
#define DAP_TRANSFER_MATCH_MASK (1U<<5)
#define DAP_TRANSFER_TIMESTAMP (1U<<7)
// DAP Transfer Response
// Single-bit masks.
#define DAP_TRANSFER_OK (1U<<0)
#define DAP_TRANSFER_WAIT (1U<<1)
#define DAP_TRANSFER_FAULT (1U<<2)
#define DAP_TRANSFER_ERROR (1U<<3)
#define DAP_TRANSFER_MISMATCH (1U<<4)
// DAP SWO Trace Mode
#define DAP_SWO_OFF 0U
#define DAP_SWO_UART 1U
#define DAP_SWO_MANCHESTER 2U
// DAP SWO Trace Status
// Single-bit masks; bits 2..5 are not assigned.
#define DAP_SWO_CAPTURE_ACTIVE (1U<<0)
#define DAP_SWO_CAPTURE_PAUSED (1U<<1)
#define DAP_SWO_STREAM_ERROR (1U<<6)
#define DAP_SWO_BUFFER_OVERRUN (1U<<7)
// DAP UART Transport
#define DAP_UART_TRANSPORT_NONE 0U
#define DAP_UART_TRANSPORT_USB_COM_PORT 1U
#define DAP_UART_TRANSPORT_DAP_COMMAND 2U
// DAP UART Control
// Single-bit masks: RX controls in bits 0..2, TX controls in bits 4..6.
#define DAP_UART_CONTROL_RX_ENABLE (1U<<0)
#define DAP_UART_CONTROL_RX_DISABLE (1U<<1)
#define DAP_UART_CONTROL_RX_BUF_FLUSH (1U<<2)
#define DAP_UART_CONTROL_TX_ENABLE (1U<<4)
#define DAP_UART_CONTROL_TX_DISABLE (1U<<5)
#define DAP_UART_CONTROL_TX_BUF_FLUSH (1U<<6)
// DAP UART Status
// Single-bit masks.
#define DAP_UART_STATUS_RX_ENABLED (1U<<0)
#define DAP_UART_STATUS_RX_DATA_LOST (1U<<1)
#define DAP_UART_STATUS_FRAMING_ERROR (1U<<2)
#define DAP_UART_STATUS_PARITY_ERROR (1U<<3)
#define DAP_UART_STATUS_TX_ENABLED (1U<<4)
// DAP UART Configure Error
// Single-bit masks.
#define DAP_UART_CFG_ERROR_DATA_BITS (1U<<0)
#define DAP_UART_CFG_ERROR_PARITY (1U<<1)
#define DAP_UART_CFG_ERROR_STOP_BITS (1U<<2)
// Debug Port Register Addresses
// Several registers share an address (0x00, 0x04, 0x08); per the per-line notes
// the register that is addressed depends on the access direction and interface.
#define DP_IDCODE 0x00U // IDCODE Register (SW Read only)
#define DP_ABORT 0x00U // Abort Register (SW Write only)
#define DP_CTRL_STAT 0x04U // Control & Status
#define DP_WCR 0x04U // Wire Control Register (SW Only)
#define DP_SELECT 0x08U // Select Register (JTAG R/W & SW W)
#define DP_RESEND 0x08U // Resend (SW Read Only)
#define DP_RDBUFF 0x0CU // Read Buffer (Read Only)
// JTAG IR Codes
#define JTAG_ABORT 0x08U
#define JTAG_DPACC 0x0AU
#define JTAG_APACC 0x0BU
#define JTAG_IDCODE 0x0EU
#define JTAG_BYPASS 0x0FU
// JTAG Sequence Info
// The three masks are disjoint and together cover all eight bits of one byte.
#define JTAG_SEQUENCE_TCK 0x3FU // TCK count
#define JTAG_SEQUENCE_TMS 0x40U // TMS value
#define JTAG_SEQUENCE_TDO 0x80U // TDO capture
// SWD Sequence Info
// Same layout as the JTAG sequence info for the count (0x3F) and capture (0x80) fields.
#define SWD_SEQUENCE_CLK 0x3FU // SWCLK count
#define SWD_SEQUENCE_DIN 0x80U // SWDIO capture
#include <stddef.h>
#include <stdint.h>
#include "cmsis_compiler.h"
// DAP Data structure
// Run-time state and configuration of the debug access port.
// The optional members depend on DAP_SWD, DAP_JTAG and DAP_JTAG_DEV_CNT, which are
// defined in DAP_config.h. This header does not include DAP_config.h before this
// point, so it must be included first: an undefined macro evaluates to 0 in #if and
// the corresponding member is silently left out.
typedef struct {
uint8_t debug_port; // Debug Port
uint8_t fast_clock; // Fast Clock Flag
uint8_t padding[2]; // explicit padding ahead of the following 32-bit members
uint32_t clock_delay; // Clock Delay
uint32_t timestamp; // Last captured Timestamp
struct { // Transfer Configuration
uint8_t idle_cycles; // Idle cycles after transfer
uint8_t padding[3]; // explicit padding ahead of the following 16-bit members
uint16_t retry_count; // Number of retries after WAIT response
uint16_t match_retry; // Number of retries if read value does not match
uint32_t match_mask; // Match Mask
} transfer;
#if (DAP_SWD != 0)
struct { // SWD Configuration
uint8_t turnaround; // Turnaround period
uint8_t data_phase; // Always generate Data Phase
} swd_conf;
#endif
#if (DAP_JTAG != 0)
struct { // JTAG Device Chain
uint8_t count; // Number of devices
uint8_t index; // Device index (device at TDO has index 0)
// Per-device arrays exist only for a non-zero device count
#if (DAP_JTAG_DEV_CNT != 0)
uint8_t ir_length[DAP_JTAG_DEV_CNT]; // IR Length in bits
uint16_t ir_before[DAP_JTAG_DEV_CNT]; // Bits before IR
uint16_t ir_after [DAP_JTAG_DEV_CNT]; // Bits after IR
#endif
} jtag_dev;
#endif
} DAP_Data_t;
extern DAP_Data_t DAP_Data; // DAP Data
extern volatile uint8_t DAP_TransferAbort; // Transfer Abort Flag
#ifdef __cplusplus
extern "C"
{
#endif
// Functions
// Pin-level sequence generation and single DP/AP transfers (SWD and JTAG).
extern void SWJ_Sequence (uint32_t count, const uint8_t *data);
extern void SWD_Sequence (uint32_t info, const uint8_t *swdo, uint8_t *swdi);
extern void JTAG_Sequence (uint32_t info, const uint8_t *tdi, uint8_t *tdo);
extern void JTAG_IR (uint32_t ir);
extern uint32_t JTAG_ReadIDCode (void);
extern void JTAG_WriteAbort (uint32_t data);
extern uint8_t JTAG_Transfer (uint32_t request, uint32_t *data);
extern uint8_t SWD_Transfer (uint32_t request, uint32_t *data);
extern void Delayms (uint32_t delay);
// SWO command handlers. Those taking (request, response) return the number of
// response bytes in the lower 16 bits and the number of consumed request bytes
// in the upper 16 bits; SWO_Status returns the response byte count only.
extern uint32_t SWO_Transport (const uint8_t *request, uint8_t *response);
extern uint32_t SWO_Mode (const uint8_t *request, uint8_t *response);
extern uint32_t SWO_Baudrate (const uint8_t *request, uint8_t *response);
extern uint32_t SWO_Control (const uint8_t *request, uint8_t *response);
extern uint32_t SWO_Status (uint8_t *response);
extern uint32_t SWO_ExtendedStatus (const uint8_t *request, uint8_t *response);
extern uint32_t SWO_Data (const uint8_t *request, uint8_t *response);
// SWO streaming hooks: QueueTransfer/AbortTransfer are called by the SWO
// module, TransferComplete is called back into it when a transfer finished.
extern void SWO_QueueTransfer (uint8_t *buf, uint32_t num);
extern void SWO_AbortTransfer (void);
extern void SWO_TransferComplete (void);
// SWO capture back-ends, one set per encoding (UART and Manchester).
extern uint32_t SWO_Mode_UART (uint32_t enable);
extern uint32_t SWO_Baudrate_UART (uint32_t baudrate);
extern uint32_t SWO_Control_UART (uint32_t active);
extern void SWO_Capture_UART (uint8_t *buf, uint32_t num);
extern uint32_t SWO_GetCount_UART (void);
extern uint32_t SWO_Mode_Manchester (uint32_t enable);
extern uint32_t SWO_Baudrate_Manchester (uint32_t baudrate);
extern uint32_t SWO_Control_Manchester (uint32_t active);
extern void SWO_Capture_Manchester (uint8_t *buf, uint32_t num);
extern uint32_t SWO_GetCount_Manchester (void);
// UART command handlers.
// NOTE(review): presumably the same return convention as the SWO handlers
// above; confirm against the implementation.
extern uint32_t UART_Transport (const uint8_t *request, uint8_t *response);
extern uint32_t UART_Configure (const uint8_t *request, uint8_t *response);
extern uint32_t UART_Control (const uint8_t *request, uint8_t *response);
extern uint32_t UART_Status (uint8_t *response);
extern uint32_t UART_Transfer (const uint8_t *request, uint8_t *response);
extern uint8_t USB_COM_PORT_Activate (uint32_t cmd);
// Command dispatch and one-time setup.
extern uint32_t DAP_ProcessVendorCommand (const uint8_t *request, uint8_t *response);
extern uint32_t DAP_ProcessCommand (const uint8_t *request, uint8_t *response);
extern uint32_t DAP_ExecuteCommand (const uint8_t *request, uint8_t *response);
extern void DAP_Setup (void);
// Configurable delay for clock generation
// DELAY_SLOW_CYCLES records the cost of one loop iteration and may be
// overridden before this point; the delay loop below does not reference it.
#ifndef DELAY_SLOW_CYCLES
#define DELAY_SLOW_CYCLES 3U // Number of cycles for one iteration
#endif
// PIN_DELAY_SLOW: busy-wait for 'delay' loop iterations.
// delay: number of iterations, must be non-zero. In both variants the counter
// is decremented before it is tested, so a value of 0 wraps around to
// 0xFFFFFFFF iterations instead of returning immediately.
#if defined(__CC_ARM)
__STATIC_FORCEINLINE void PIN_DELAY_SLOW (uint32_t delay) {
uint32_t count = delay;
while (--count);
}
#else
__STATIC_FORCEINLINE void PIN_DELAY_SLOW (uint32_t delay) {
// Hand-written subtract/branch loop; 'delay' is a read-write operand and the
// condition flags are declared as clobbered.
__ASM volatile (
".syntax unified\n"
"0:\n\t"
"subs %0,%0,#1\n\t"
"bne 0b\n"
: "+l" (delay) : : "cc"
);
}
#endif
// Fixed delay for fast clock generation
// DELAY_FAST_CYCLES selects how many NOP instructions make up the delay and
// may be overridden before this point.
#ifndef DELAY_FAST_CYCLES
#define DELAY_FAST_CYCLES 0U // Number of cycles: 0..3
#endif
// PIN_DELAY_FAST: emit 0, 1, 2 or 3 NOPs depending on DELAY_FAST_CYCLES
// (values above 3 are treated like 3).
__STATIC_FORCEINLINE void PIN_DELAY_FAST (void) {
#if (DELAY_FAST_CYCLES >= 3U)
  __NOP();
  __NOP();
  __NOP();
#elif (DELAY_FAST_CYCLES >= 2U)
  __NOP();
  __NOP();
#elif (DELAY_FAST_CYCLES >= 1U)
  __NOP();
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* __DAP_H__ */

File diff suppressed because it is too large Load Diff

View File

@ -1,100 +0,0 @@
/*
* Copyright (c) 2013-2017 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 1. December 2017
* $Revision: V2.0.0
*
* Project: CMSIS-DAP Source
* Title: DAP_vendor.c CMSIS-DAP Vendor Commands
*
*---------------------------------------------------------------------------*/
#include "DAP_config.h"
#include "DAP.h"
//**************************************************************************************************
/**
\defgroup DAP_Vendor_Adapt_gr Adapt Vendor Commands
\ingroup DAP_Vendor_gr
@{
The file DAP_vendor.c provides template source code for extension of a Debug Unit with
Vendor Commands. Copy this file to the project folder of the Debug Unit and add the
file to the MDK-ARM project under the file group Configuration.
*/
/** Process DAP Vendor Command and prepare Response Data
\param request pointer to request data
\param response pointer to response data
\return number of bytes in response (lower 16 bits)
number of bytes in request (upper 16 bits)
*/
uint32_t DAP_ProcessVendorCommand(const uint8_t *request, uint8_t *response) {
  // Default result: one request byte consumed (upper 16 bits) and one
  // response byte produced (lower 16 bits), i.e. just the echoed Command ID.
  uint32_t num = (1U << 16) | 1U;

  *response++ = *request;        // copy Command ID

  switch (*request++) {          // first byte in request is Command ID
    case ID_DAP_Vendor0:
#if 0                            // example user command
      num += 1U << 16;           // increment request count
      if (*request == 1U) {      // when first command data byte is 1
        *response++ = 'X';       // send 'X' as response
        num++;                   // increment response count
      }
#endif
      break;

    case ID_DAP_Vendor1:  break;
    case ID_DAP_Vendor2:  break;
    case ID_DAP_Vendor3:  break;
    case ID_DAP_Vendor4:  break;
    case ID_DAP_Vendor5:  break;
    case ID_DAP_Vendor6:  break;
    case ID_DAP_Vendor7:  break;
    case ID_DAP_Vendor8:  break;
    case ID_DAP_Vendor9:  break;
    case ID_DAP_Vendor10: break;
    case ID_DAP_Vendor11: break;
    case ID_DAP_Vendor12: break;
    case ID_DAP_Vendor13: break;
    case ID_DAP_Vendor14: break;
    case ID_DAP_Vendor15: break;
    case ID_DAP_Vendor16: break;
    case ID_DAP_Vendor17: break;
    case ID_DAP_Vendor18: break;
    case ID_DAP_Vendor19: break;
    case ID_DAP_Vendor20: break;
    case ID_DAP_Vendor21: break;
    case ID_DAP_Vendor22: break;
    case ID_DAP_Vendor23: break;
    case ID_DAP_Vendor24: break;
    case ID_DAP_Vendor25: break;
    case ID_DAP_Vendor26: break;
    case ID_DAP_Vendor27: break;
    case ID_DAP_Vendor28: break;
    case ID_DAP_Vendor29: break;
    case ID_DAP_Vendor30: break;
    case ID_DAP_Vendor31: break;

    default:
      // Any other ID: keep the default "ID echoed, nothing else" result.
      break;
  }

  return (num);
}
///@}

View File

@ -1,370 +0,0 @@
/*
* Copyright (c) 2013-2017 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 1. December 2017
* $Revision: V2.0.0
*
* Project: CMSIS-DAP Source
* Title: JTAG_DP.c CMSIS-DAP JTAG DP I/O
*
*---------------------------------------------------------------------------*/
#include "DAP_config.h"
#include "DAP.h"
// JTAG Macros
// JTAG pin names mapped onto the shared SWD/JTAG pin accessors.
#define PIN_TCK_SET PIN_SWCLK_TCK_SET
#define PIN_TCK_CLR PIN_SWCLK_TCK_CLR
#define PIN_TMS_SET PIN_SWDIO_TMS_SET
#define PIN_TMS_CLR PIN_SWDIO_TMS_CLR

// Clock-cycle helpers. Each one drives TCK low, waits, drives TCK high and
// waits again, using whichever PIN_DELAY() is defined where the macro is
// expanded. They are wrapped in do { } while (0) so that each behaves as one
// statement (safe inside an unbraced if/else); use them as "MACRO(...);".

// One TCK cycle without touching TDI or sampling TDO.
#define JTAG_CYCLE_TCK()                \
  do {                                  \
    PIN_TCK_CLR();                      \
    PIN_DELAY();                        \
    PIN_TCK_SET();                      \
    PIN_DELAY();                        \
  } while (0)

// One TCK cycle with 'tdi' put on TDI before the falling edge.
#define JTAG_CYCLE_TDI(tdi)             \
  do {                                  \
    PIN_TDI_OUT(tdi);                   \
    PIN_TCK_CLR();                      \
    PIN_DELAY();                        \
    PIN_TCK_SET();                      \
    PIN_DELAY();                        \
  } while (0)

// One TCK cycle with TDO sampled into 'tdo' (an lvalue) while TCK is low.
#define JTAG_CYCLE_TDO(tdo)             \
  do {                                  \
    PIN_TCK_CLR();                      \
    PIN_DELAY();                        \
    (tdo) = PIN_TDO_IN();               \
    PIN_TCK_SET();                      \
    PIN_DELAY();                        \
  } while (0)

// One TCK cycle that both drives TDI and samples TDO.
#define JTAG_CYCLE_TDIO(tdi,tdo)        \
  do {                                  \
    PIN_TDI_OUT(tdi);                   \
    PIN_TCK_CLR();                      \
    PIN_DELAY();                        \
    (tdo) = PIN_TDO_IN();               \
    PIN_TCK_SET();                      \
    PIN_DELAY();                        \
  } while (0)

// Default delay: slow, configurable clock.
#define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
#if (DAP_JTAG != 0)
// Generate JTAG Sequence
//   info:   sequence information (JTAG_SEQUENCE_TCK clock count where 0 means
//           64, JTAG_SEQUENCE_TMS level, JTAG_SEQUENCE_TDO capture request)
//   tdi:    pointer to TDI generated data, consumed one byte per 8 clocks,
//           least significant bit first
//   tdo:    pointer to TDO captured data, written only when capture is requested
//   return: none
void JTAG_Sequence (uint32_t info, const uint8_t *tdi, uint8_t *tdo) {
  uint32_t clocks_left = info & JTAG_SEQUENCE_TCK;
  uint32_t out_bits;
  uint32_t in_bits;
  uint32_t sample;
  uint32_t slots;

  if (clocks_left == 0U) {
    clocks_left = 64U;
  }

  // TMS stays at one level for the whole sequence.
  if ((info & JTAG_SEQUENCE_TMS) != 0U) {
    PIN_TMS_SET();
  } else {
    PIN_TMS_CLR();
  }

  while (clocks_left != 0U) {
    out_bits = *tdi++;
    in_bits  = 0U;
    slots    = 8U;
    while ((slots != 0U) && (clocks_left != 0U)) {
      JTAG_CYCLE_TDIO(out_bits, sample);
      out_bits >>= 1;
      in_bits = (in_bits >> 1) | (sample << 7);
      slots--;
      clocks_left--;
    }
    // A partial last byte: move the captured bits down to bit 0.
    in_bits >>= slots;
    if ((info & JTAG_SEQUENCE_TDO) != 0U) {
      *tdo++ = (uint8_t)in_bits;
    }
  }
}
// JTAG Set IR
// ir: IR value
// return: none
//
// JTAG_IR_Function(speed) expands to the definition of
//   static void JTAG_IR_<speed> (uint32_t ir)
// The generated function uses whichever PIN_DELAY() is defined at the point
// where the macro is expanded, which is how the Fast and Slow variants differ.
// It walks the TAP from Idle to Shift-IR, clocks ir_before[index] bypass bits
// with TDI high, shifts the IR value least significant bit first, clocks
// ir_after[index] bypass bits, and returns to Idle through Update-IR. TMS is
// raised on the very last shifted bit (IR bit or bypass bit) to leave Shift-IR.
// NOTE(review): an ir_length[] entry of 0 makes the "- 1U" loop start value
// wrap to a very large count; presumably the configuration code guarantees a
// length of at least 1 - confirm.
#define JTAG_IR_Function(speed) /**/ \
static void JTAG_IR_##speed (uint32_t ir) { \
uint32_t n; \
\
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Select-DR-Scan */ \
JTAG_CYCLE_TCK(); /* Select-IR-Scan */ \
PIN_TMS_CLR(); \
JTAG_CYCLE_TCK(); /* Capture-IR */ \
JTAG_CYCLE_TCK(); /* Shift-IR */ \
\
PIN_TDI_OUT(1U); \
for (n = DAP_Data.jtag_dev.ir_before[DAP_Data.jtag_dev.index]; n; n--) { \
JTAG_CYCLE_TCK(); /* Bypass before data */ \
} \
for (n = DAP_Data.jtag_dev.ir_length[DAP_Data.jtag_dev.index] - 1U; n; n--) { \
JTAG_CYCLE_TDI(ir); /* Set IR bits (except last) */ \
ir >>= 1; \
} \
n = DAP_Data.jtag_dev.ir_after[DAP_Data.jtag_dev.index]; \
if (n) { \
JTAG_CYCLE_TDI(ir); /* Set last IR bit */ \
PIN_TDI_OUT(1U); \
for (--n; n; n--) { \
JTAG_CYCLE_TCK(); /* Bypass after data */ \
} \
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Bypass & Exit1-IR */ \
} else { \
PIN_TMS_SET(); \
JTAG_CYCLE_TDI(ir); /* Set last IR bit & Exit1-IR */ \
} \
\
JTAG_CYCLE_TCK(); /* Update-IR */ \
PIN_TMS_CLR(); \
JTAG_CYCLE_TCK(); /* Idle */ \
PIN_TDI_OUT(1U); \
}
// JTAG Transfer I/O
// request: A[3:2] RnW APnDP
// data: DATA[31:0]
// return: ACK[2:0]
//
// JTAG_TransferFunction(speed) expands to the definition of
//   static uint8_t JTAG_Transfer<speed> (uint32_t request, uint32_t *data)
// using whichever PIN_DELAY() is defined at the point of expansion.
// Sequence performed by the generated function:
//  - enter Shift-DR and clock one bypass bit per device before the selected one
//    (jtag_dev.index cycles);
//  - shift RnW, A2, A3 out while capturing three ACK bits; the first captured
//    bit is stored in ACK bit 1, the second in bit 0 and the third in bit 2;
//  - if ACK differs from DAP_TRANSFER_OK, leave Shift-DR at once and skip the
//    data phase;
//  - otherwise shift 32 data bits, least significant first. On a read the value
//    is stored through 'data' only when 'data' is non-NULL; on a write 'data'
//    is dereferenced unconditionally, so it must be valid for write requests;
//  - clock (count - index - 1) bypass bits for the devices after the selected
//    one, raising TMS on the last shifted bit;
//  - pass through Update-DR to Idle, optionally capture a timestamp
//    (DAP_TRANSFER_TIMESTAMP) and clock transfer.idle_cycles extra idle cycles.
#define JTAG_TransferFunction(speed) /**/ \
static uint8_t JTAG_Transfer##speed (uint32_t request, uint32_t *data) { \
uint32_t ack; \
uint32_t bit; \
uint32_t val; \
uint32_t n; \
\
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Select-DR-Scan */ \
PIN_TMS_CLR(); \
JTAG_CYCLE_TCK(); /* Capture-DR */ \
JTAG_CYCLE_TCK(); /* Shift-DR */ \
\
for (n = DAP_Data.jtag_dev.index; n; n--) { \
JTAG_CYCLE_TCK(); /* Bypass before data */ \
} \
\
JTAG_CYCLE_TDIO(request >> 1, bit); /* Set RnW, Get ACK.0 */ \
ack = bit << 1; \
JTAG_CYCLE_TDIO(request >> 2, bit); /* Set A2, Get ACK.1 */ \
ack |= bit << 0; \
JTAG_CYCLE_TDIO(request >> 3, bit); /* Set A3, Get ACK.2 */ \
ack |= bit << 2; \
\
if (ack != DAP_TRANSFER_OK) { \
/* Exit on error */ \
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Exit1-DR */ \
goto exit; \
} \
\
if (request & DAP_TRANSFER_RnW) { \
/* Read Transfer */ \
val = 0U; \
for (n = 31U; n; n--) { \
JTAG_CYCLE_TDO(bit); /* Get D0..D30 */ \
val |= bit << 31; \
val >>= 1; \
} \
n = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U; \
if (n) { \
JTAG_CYCLE_TDO(bit); /* Get D31 */ \
for (--n; n; n--) { \
JTAG_CYCLE_TCK(); /* Bypass after data */ \
} \
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Bypass & Exit1-DR */ \
} else { \
PIN_TMS_SET(); \
JTAG_CYCLE_TDO(bit); /* Get D31 & Exit1-DR */ \
} \
val |= bit << 31; \
if (data) { *data = val; } \
} else { \
/* Write Transfer */ \
val = *data; \
for (n = 31U; n; n--) { \
JTAG_CYCLE_TDI(val); /* Set D0..D30 */ \
val >>= 1; \
} \
n = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U; \
if (n) { \
JTAG_CYCLE_TDI(val); /* Set D31 */ \
for (--n; n; n--) { \
JTAG_CYCLE_TCK(); /* Bypass after data */ \
} \
PIN_TMS_SET(); \
JTAG_CYCLE_TCK(); /* Bypass & Exit1-DR */ \
} else { \
PIN_TMS_SET(); \
JTAG_CYCLE_TDI(val); /* Set D31 & Exit1-DR */ \
} \
} \
\
exit: \
JTAG_CYCLE_TCK(); /* Update-DR */ \
PIN_TMS_CLR(); \
JTAG_CYCLE_TCK(); /* Idle */ \
PIN_TDI_OUT(1U); \
\
/* Capture Timestamp */ \
if (request & DAP_TRANSFER_TIMESTAMP) { \
DAP_Data.timestamp = TIMESTAMP_GET(); \
} \
\
/* Idle cycles */ \
n = DAP_Data.transfer.idle_cycles; \
while (n--) { \
JTAG_CYCLE_TCK(); /* Idle */ \
} \
\
return ((uint8_t)ack); \
}
// Instantiate the IR and transfer routines twice. PIN_DELAY() is redefined
// before each pair so that JTAG_IR_Fast / JTAG_TransferFast use the fixed
// PIN_DELAY_FAST() delay and JTAG_IR_Slow / JTAG_TransferSlow use the
// configurable PIN_DELAY_SLOW(DAP_Data.clock_delay) delay. PIN_DELAY() is left
// as the slow variant for the functions defined after this point.
#undef PIN_DELAY
#define PIN_DELAY() PIN_DELAY_FAST()
JTAG_IR_Function(Fast)
JTAG_TransferFunction(Fast)
#undef PIN_DELAY
#define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
JTAG_IR_Function(Slow)
JTAG_TransferFunction(Slow)
// JTAG Read IDCODE register
//   return: 32-bit value shifted out of the selected device, assembled least
//           significant bit first
uint32_t JTAG_ReadIDCode (void) {
  uint32_t sample;
  uint32_t idcode = 0U;
  uint32_t skip;
  uint32_t i;

  // Idle -> Select-DR-Scan -> Capture-DR -> Shift-DR
  PIN_TMS_SET();
  JTAG_CYCLE_TCK();                     /* Select-DR-Scan */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                     /* Capture-DR */
  JTAG_CYCLE_TCK();                     /* Shift-DR */

  // One bypass bit per device located before the selected one.
  skip = DAP_Data.jtag_dev.index;
  while (skip != 0U) {
    JTAG_CYCLE_TCK();                   /* Bypass before data */
    skip--;
  }

  // D0..D30: each new bit enters at bit 31 and is then shifted down.
  for (i = 0U; i < 31U; i++) {
    JTAG_CYCLE_TDO(sample);
    idcode = (idcode | (sample << 31)) >> 1;
  }

  // D31 is sampled with TMS high, which also leaves Shift-DR.
  PIN_TMS_SET();
  JTAG_CYCLE_TDO(sample);               /* Get D31 & Exit1-DR */
  idcode |= sample << 31;

  JTAG_CYCLE_TCK();                     /* Update-DR */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                     /* Idle */

  return (idcode);
}
// JTAG Write ABORT register
//   data:   value to write (shifted out least significant bit first)
//   return: none
void JTAG_WriteAbort (uint32_t data) {
  uint32_t skip;
  uint32_t trailing;
  uint32_t i;

  // Idle -> Select-DR-Scan -> Capture-DR -> Shift-DR
  PIN_TMS_SET();
  JTAG_CYCLE_TCK();                     /* Select-DR-Scan */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                     /* Capture-DR */
  JTAG_CYCLE_TCK();                     /* Shift-DR */

  // One bypass bit per device located before the selected one.
  skip = DAP_Data.jtag_dev.index;
  while (skip != 0U) {
    JTAG_CYCLE_TCK();                   /* Bypass before data */
    skip--;
  }

  // Three header bits, all zero: RnW=0 (write), A2=0, A3=0.
  PIN_TDI_OUT(0U);
  JTAG_CYCLE_TCK();                     /* Set RnW=0 (Write) */
  JTAG_CYCLE_TCK();                     /* Set A2=0 */
  JTAG_CYCLE_TCK();                     /* Set A3=0 */

  for (i = 0U; i < 31U; i++) {
    JTAG_CYCLE_TDI(data);               /* Set D0..D30 */
    data >>= 1;
  }

  // Number of devices located after the selected one.
  trailing = DAP_Data.jtag_dev.count - DAP_Data.jtag_dev.index - 1U;
  if (trailing == 0U) {
    // Last device in the chain: D31 itself leaves Shift-DR.
    PIN_TMS_SET();
    JTAG_CYCLE_TDI(data);               /* Set D31 & Exit1-DR */
  } else {
    JTAG_CYCLE_TDI(data);               /* Set D31 */
    for (i = 1U; i < trailing; i++) {
      JTAG_CYCLE_TCK();                 /* Bypass after data */
    }
    PIN_TMS_SET();
    JTAG_CYCLE_TCK();                   /* Bypass & Exit1-DR */
  }

  JTAG_CYCLE_TCK();                     /* Update-DR */
  PIN_TMS_CLR();
  JTAG_CYCLE_TCK();                     /* Idle */
  PIN_TDI_OUT(1U);
}
// JTAG Set IR
//   ir:     IR value
//   return: none
// Dispatches to the fast or slow generated routine according to
// DAP_Data.fast_clock.
void JTAG_IR (uint32_t ir) {
  if (DAP_Data.fast_clock == 0U) {
    JTAG_IR_Slow(ir);
    return;
  }
  JTAG_IR_Fast(ir);
}
// JTAG Transfer I/O
//   request: A[3:2] RnW APnDP
//   data:    DATA[31:0]
//   return:  ACK[2:0]
// Dispatches to the fast or slow generated routine according to
// DAP_Data.fast_clock.
uint8_t JTAG_Transfer(uint32_t request, uint32_t *data) {
  return (DAP_Data.fast_clock != 0U) ? JTAG_TransferFast(request, data)
                                     : JTAG_TransferSlow(request, data);
}
#endif /* (DAP_JTAG != 0) */

View File

@ -1,798 +0,0 @@
/*
* Copyright (c) 2013-2021 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 29. March 2021
* $Revision: V2.0.1
*
* Project: CMSIS-DAP Source
* Title: SWO.c CMSIS-DAP SWO I/O
*
*---------------------------------------------------------------------------*/
#include "DAP_config.h"
#include "DAP.h"
#if (SWO_UART != 0)
#include "Driver_USART.h"
#endif
#if (SWO_STREAM != 0)
#include "cmsis_os2.h"
#define osObjectsExternal
#include "osObjects.h"
#endif
#if (SWO_STREAM != 0)
#ifdef DAP_FW_V1
#error "SWO Streaming Trace not supported in DAP V1!"
#endif
#endif
#if (SWO_UART != 0)
// USART Driver
// Two macro levels are used so that SWO_UART_DRIVER is expanded to its value
// before it is pasted onto the Driver_USART prefix.
#define _USART_Driver_(n) Driver_USART##n
#define USART_Driver_(n) _USART_Driver_(n)
extern ARM_DRIVER_USART USART_Driver_(SWO_UART_DRIVER);
#define pUSART (&USART_Driver_(SWO_UART_DRIVER))
// Set to 1 by SWO_Baudrate_UART after the driver accepted the configuration and
// cleared by SWO_Mode_UART; SWO_Control_UART refuses to start a capture while
// it is 0.
static uint8_t USART_Ready = 0U;
#endif /* (SWO_UART != 0) */
#if ((SWO_UART != 0) || (SWO_MANCHESTER != 0))
#define SWO_STREAM_TIMEOUT 50U /* Stream timeout in ms */
#define USB_BLOCK_SIZE 512U /* USB Block Size */
#define TRACE_BLOCK_SIZE 64U /* Trace Block Size (2^n: 32...512) */
// Trace State
// TraceTransport: 0, 1 or (with SWO_STREAM) 2 as accepted by SWO_Transport;
// 1 makes SWO_Data return buffered bytes, 2 hands data to the SWO thread.
static uint8_t TraceTransport = 0U; /* Trace Transport */
static uint8_t TraceMode = 0U; /* Trace Mode */
static uint8_t TraceStatus = 0U; /* Trace Status without Errors */
// Two error banks: SetTraceError() ORs flags into the active bank while
// GetTraceStatus() switches banks and then reads and clears the old one.
static uint8_t TraceError[2] = {0U, 0U}; /* Trace Error flags (banked) */
static uint8_t TraceError_n = 0U; /* Active Trace Error bank */
// Trace Buffer
// TraceIndexI / TraceIndexO are free-running byte counters; they are masked
// with (SWO_BUFFER_SIZE - 1U) whenever they are used as an offset into
// TraceBuf, and their difference is the number of buffered bytes.
static uint8_t TraceBuf[SWO_BUFFER_SIZE]; /* Trace Buffer (must be 2^n) */
static volatile uint32_t TraceIndexI = 0U; /* Incoming Trace Index */
static volatile uint32_t TraceIndexO = 0U; /* Outgoing Trace Index */
// Set to 1 by the receive-complete callback; readers clear it before sampling
// the indices and sample again if it was set meanwhile.
static volatile uint8_t TraceUpdate; /* Trace Update Flag */
// Size of the receive request currently handed to the capture back-end.
static uint32_t TraceBlockSize; /* Current Trace Block Size */
#if (TIMESTAMP_CLOCK != 0U)
// Trace Timestamp
// Incoming index and TIMESTAMP_GET() tick recorded at the latest
// receive-complete event.
static volatile struct {
uint32_t index;
uint32_t tick;
} TraceTimestamp;
#endif
// Trace Helper functions
static void ClearTrace (void);
static void ResumeTrace (void);
static uint32_t GetTraceCount (void);
static uint8_t GetTraceStatus (void);
static void SetTraceError (uint8_t flag);
#if (SWO_STREAM != 0)
// Streaming transport state: the thread to wake and the transfer in flight.
extern osThreadId_t SWO_ThreadId;
static volatile uint8_t TransferBusy = 0U; /* Transfer Busy Flag */
static uint32_t TransferSize; /* Current Transfer Size */
#endif
#if (SWO_UART != 0)
// USART Driver Callback function
// event: event mask
// Registered with the USART driver by SWO_Mode_UART. On receive-complete it
// advances the incoming index by the size of the finished request and either
// queues the next request or pauses the capture when the buffer cannot take
// another block. Overflow and line errors are recorded as trace error flags.
static void USART_Callback (uint32_t event) {
uint32_t index_i;
uint32_t index_o;
uint32_t count;
uint32_t num;
if (event & ARM_USART_EVENT_RECEIVE_COMPLETE) {
#if (TIMESTAMP_CLOCK != 0U)
TraceTimestamp.tick = TIMESTAMP_GET();
#endif
index_o = TraceIndexO;
index_i = TraceIndexI;
index_i += TraceBlockSize;
TraceIndexI = index_i;
#if (TIMESTAMP_CLOCK != 0U)
TraceTimestamp.index = index_i;
#endif
// Next request runs up to the following TRACE_BLOCK_SIZE boundary, so a
// request never crosses a block boundary of the buffer.
num = TRACE_BLOCK_SIZE - (index_i & (TRACE_BLOCK_SIZE - 1U));
// Bytes currently buffered (unsigned difference of free-running counters).
count = index_i - index_o;
if (count <= (SWO_BUFFER_SIZE - num)) {
index_i &= SWO_BUFFER_SIZE - 1U;
TraceBlockSize = num;
pUSART->Receive(&TraceBuf[index_i], num);
} else {
// Not enough free space for the next block: mark the capture as paused.
// ResumeTrace() restarts it once data has been taken out.
TraceStatus = DAP_SWO_CAPTURE_ACTIVE | DAP_SWO_CAPTURE_PAUSED;
}
// Tell concurrent readers that the indices changed under them.
TraceUpdate = 1U;
#if (SWO_STREAM != 0)
if (TraceTransport == 2U) {
// Wake the SWO thread once the buffered data reaches the next
// USB_BLOCK_SIZE boundary of the outgoing index.
if (count >= (USB_BLOCK_SIZE - (index_o & (USB_BLOCK_SIZE - 1U)))) {
osThreadFlagsSet(SWO_ThreadId, 1U);
}
}
#endif
}
if (event & ARM_USART_EVENT_RX_OVERFLOW) {
SetTraceError(DAP_SWO_BUFFER_OVERRUN);
}
if (event & (ARM_USART_EVENT_RX_BREAK |
ARM_USART_EVENT_RX_FRAMING_ERROR |
ARM_USART_EVENT_RX_PARITY_ERROR)) {
SetTraceError(DAP_SWO_STREAM_ERROR);
}
}
// Enable or disable SWO Mode (UART)
//   enable: non-zero initializes and powers the USART driver, zero stops
//           reception, powers the driver off and uninitializes it
//   return: 1 - Success, 0 - Error
// USART_Ready is cleared in every case; a baudrate must be configured again
// before a capture can be started.
__WEAK uint32_t SWO_Mode_UART (uint32_t enable) {
  USART_Ready = 0U;

  if (enable == 0U) {
    // Tear-down is best effort: driver results are intentionally not checked.
    (void)pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    (void)pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    (void)pUSART->PowerControl(ARM_POWER_OFF);
    (void)pUSART->Uninitialize();
    return (1U);
  }

  if (pUSART->Initialize(USART_Callback) != ARM_DRIVER_OK) {
    return (0U);
  }
  if (pUSART->PowerControl(ARM_POWER_FULL) != ARM_DRIVER_OK) {
    // Undo the successful Initialize() before reporting the failure.
    (void)pUSART->Uninitialize();
    return (0U);
  }

  return (1U);
}
// Configure SWO Baudrate (UART)
//   baudrate: requested baudrate (limited to SWO_UART_MAX_BAUDRATE)
//   return:   actual baudrate or 0 when not configured
// When a capture is active, reception is stopped around the reconfiguration
// and restarted afterwards.
__WEAK uint32_t SWO_Baudrate_UART (uint32_t baudrate) {
  uint32_t offset;
  uint32_t chunk;

  if (baudrate > SWO_UART_MAX_BAUDRATE) {
    baudrate = SWO_UART_MAX_BAUDRATE;
  }

  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) != 0U) {
    (void)pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    if (pUSART->GetStatus().rx_busy) {
      // Account for the bytes of the unfinished request before aborting it.
      TraceIndexI += pUSART->GetRxCount();
      (void)pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    }
  }

  // Asynchronous mode, 8 data bits, no parity, 1 stop bit.
  if (pUSART->Control(ARM_USART_MODE_ASYNCHRONOUS |
                      ARM_USART_DATA_BITS_8       |
                      ARM_USART_PARITY_NONE       |
                      ARM_USART_STOP_BITS_1,
                      baudrate) != ARM_DRIVER_OK) {
    USART_Ready = 0U;
    return (0U);
  }
  USART_Ready = 1U;

  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) == 0U) {
    return (baudrate);
  }

  if ((TraceStatus & DAP_SWO_CAPTURE_PAUSED) == 0U) {
    // Queue a request that ends at the next TRACE_BLOCK_SIZE boundary.
    offset = TraceIndexI & (SWO_BUFFER_SIZE - 1U);
    chunk  = TRACE_BLOCK_SIZE - (offset & (TRACE_BLOCK_SIZE - 1U));
    TraceBlockSize = chunk;
    (void)pUSART->Receive(&TraceBuf[offset], chunk);
  }
  (void)pUSART->Control(ARM_USART_CONTROL_RX, 1U);

  return (baudrate);
}
// Control SWO Capture (UART)
//   active: non-zero starts the capture, zero stops it
//   return: 1 - Success, 0 - Error
__WEAK uint32_t SWO_Control_UART (uint32_t active) {
  if (active == 0U) {
    // Stop: disable the receiver and fold any partially received request
    // into the incoming index before aborting it.
    (void)pUSART->Control(ARM_USART_CONTROL_RX, 0U);
    if (pUSART->GetStatus().rx_busy) {
      TraceIndexI += pUSART->GetRxCount();
      (void)pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
    }
    return (1U);
  }

  // Start: only possible after a baudrate has been configured.
  if (!USART_Ready) {
    return (0U);
  }

  // The first request covers a single byte at the start of the buffer.
  TraceBlockSize = 1U;
  if (pUSART->Receive(&TraceBuf[0], 1U) != ARM_DRIVER_OK) {
    return (0U);
  }
  if (pUSART->Control(ARM_USART_CONTROL_RX, 1U) != ARM_DRIVER_OK) {
    return (0U);
  }

  return (1U);
}
// Start SWO Capture (UART)
//   buf: pointer to buffer for capturing
//   num: number of bytes to capture
// Records the request size (the receive-complete callback advances the
// incoming index by it) and hands the request to the driver; the driver's
// return code is not evaluated.
__WEAK void SWO_Capture_UART (uint8_t *buf, uint32_t num) {
  TraceBlockSize = num;
  (void)pUSART->Receive(buf, num);
}
// Get SWO Pending Trace Count (UART)
//   return: number of pending trace data bytes, i.e. the driver's receive
//           count while a request is in progress and 0 otherwise
__WEAK uint32_t SWO_GetCount_UART (void) {
  if (!pUSART->GetStatus().rx_busy) {
    return (0U);
  }
  return (pUSART->GetRxCount());
}
#endif /* (SWO_UART != 0) */
#if (SWO_MANCHESTER != 0)
// The Manchester back-end below consists of weak placeholder routines: the
// ones that report a result always report an error (0) and the capture
// routine does nothing. They can be replaced by non-weak definitions.

// Enable or disable SWO Mode (Manchester)
//   enable: enable flag (ignored by this placeholder)
//   return: 1 - Success, 0 - Error (always 0 here)
__WEAK uint32_t SWO_Mode_Manchester (uint32_t enable) {
  (void)enable;
  return (0U);
}

// Configure SWO Baudrate (Manchester)
//   baudrate: requested baudrate (ignored by this placeholder)
//   return:   actual baudrate or 0 when not configured (always 0 here)
__WEAK uint32_t SWO_Baudrate_Manchester (uint32_t baudrate) {
  (void)baudrate;
  return (0U);
}

// Control SWO Capture (Manchester)
//   active: active flag (ignored by this placeholder)
//   return: 1 - Success, 0 - Error (always 0 here)
__WEAK uint32_t SWO_Control_Manchester (uint32_t active) {
  (void)active;
  return (0U);
}

// Start SWO Capture (Manchester)
//   buf: pointer to buffer for capturing (ignored by this placeholder)
//   num: number of bytes to capture (ignored by this placeholder)
__WEAK void SWO_Capture_Manchester (uint8_t *buf, uint32_t num) {
  (void)buf;
  (void)num;
}
// Get SWO Pending Trace Count (Manchester)
//   return: number of pending trace data bytes
// Weak placeholder: no Manchester capture is implemented here, so there are
// never pending bytes. The explicit return is required because
// GetTraceCount() adds this function's result to the trace count; falling off
// the end of a non-void function whose value is used is undefined behavior.
__WEAK uint32_t SWO_GetCount_Manchester (void) {
  return (0U);
}
#endif /* (SWO_MANCHESTER != 0) */
// Clear Trace Errors and Data
// Resets both error banks, the buffer indices and (when timestamps are
// enabled) the recorded timestamp. With the streaming transport selected, a
// transfer that is still in flight is aborted first.
static void ClearTrace (void) {
#if (SWO_STREAM != 0)
  if ((TraceTransport == 2U) && (TransferBusy != 0U)) {
    SWO_AbortTransfer();
    TransferBusy = 0U;
  }
#endif

  TraceError_n  = 0U;
  TraceError[0] = 0U;
  TraceError[1] = 0U;

  TraceIndexI = 0U;
  TraceIndexO = 0U;

#if (TIMESTAMP_CLOCK != 0U)
  TraceTimestamp.index = 0U;
  TraceTimestamp.tick  = 0U;
#endif
}
// Resume Trace Capture
// Restarts a capture that was paused because the trace buffer was full. Does
// nothing unless the status is exactly "active and paused" and the buffer has
// at least one free byte; the restart request covers a single byte at the
// current incoming position.
static void ResumeTrace (void) {
  uint32_t head;
  uint32_t tail;

  if (TraceStatus != (DAP_SWO_CAPTURE_ACTIVE | DAP_SWO_CAPTURE_PAUSED)) {
    return;
  }

  head = TraceIndexI;
  tail = TraceIndexO;
  if ((head - tail) >= SWO_BUFFER_SIZE) {
    return;                             // buffer still completely full
  }
  head &= SWO_BUFFER_SIZE - 1U;

#if (SWO_UART != 0)
  if (TraceMode == DAP_SWO_UART) {
    TraceStatus = DAP_SWO_CAPTURE_ACTIVE;
    SWO_Capture_UART(&TraceBuf[head], 1U);
    return;
  }
#endif
#if (SWO_MANCHESTER != 0)
  if (TraceMode == DAP_SWO_MANCHESTER) {
    TraceStatus = DAP_SWO_CAPTURE_ACTIVE;
    SWO_Capture_Manchester(&TraceBuf[head], 1U);
    return;
  }
#endif
}
// Get Trace Count
// return: number of available data bytes in trace buffer
// While a capture is running (status exactly DAP_SWO_CAPTURE_ACTIVE) the
// result also includes the bytes already received for the request that is
// still in progress, as reported by the active back-end.
static uint32_t GetTraceCount (void) {
uint32_t count;
if (TraceStatus == DAP_SWO_CAPTURE_ACTIVE) {
// Retry loop: TraceUpdate is cleared first and set again by the
// receive-complete callback. If it is set after sampling, the indices and
// the driver count may belong to different requests, so sample again.
do {
TraceUpdate = 0U;
count = TraceIndexI - TraceIndexO;
switch (TraceMode) {
#if (SWO_UART != 0)
case DAP_SWO_UART:
count += SWO_GetCount_UART();
break;
#endif
#if (SWO_MANCHESTER != 0)
case DAP_SWO_MANCHESTER:
count += SWO_GetCount_Manchester();
break;
#endif
default:
break;
}
} while (TraceUpdate != 0U);
} else {
// Capture stopped or paused: only completed data counts.
count = TraceIndexI - TraceIndexO;
}
return (count);
}
// Get Trace Status (clear Error flags)
// return: Trace Status (Active flag and Error flags)
// The error flags live in two banks. The active bank is switched before the
// old one is read and cleared, so SetTraceError() calls made from now on go to
// the other bank and are not wiped by the clear below.
static uint8_t GetTraceStatus (void) {
uint8_t status;
uint32_t n;
n = TraceError_n;
TraceError_n ^= 1U;
status = TraceStatus | TraceError[n];
TraceError[n] = 0U;
return (status);
}
// Set Trace Error flag(s)
// flag: error flag(s) to set
// The flags are ORed into the currently active error bank and stay there until
// GetTraceStatus() reports and clears that bank.
static void SetTraceError (uint8_t flag) {
TraceError[TraceError_n] |= flag;
}
// Process SWO Transport command and prepare response
//   request:  pointer to request data (one byte: transport selector)
//   response: pointer to response data (one byte: DAP_OK or DAP_ERROR)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
// The transport can only be changed while no capture is active. Accepted
// selectors are 0 and 1, plus 2 when SWO_STREAM is enabled.
uint32_t SWO_Transport (const uint8_t *request, uint8_t *response) {
#if (SWO_STREAM != 0)
  const uint8_t highest = 2U;
#else
  const uint8_t highest = 1U;
#endif
  uint8_t  selector;
  uint32_t accepted = 0U;

  if ((TraceStatus & DAP_SWO_CAPTURE_ACTIVE) == 0U) {
    selector = *request;
    if (selector <= highest) {
      TraceTransport = selector;
      accepted = 1U;
    }
  }

  if (accepted != 0U) {
    *response = DAP_OK;
  } else {
    *response = DAP_ERROR;
  }

  return ((1U << 16) | 1U);
}
// Process SWO Mode command and prepare response
//   request:  pointer to request data (one byte: requested trace mode)
//   response: pointer to response data (one byte: DAP_OK or DAP_ERROR)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
// The currently selected back-end is always disabled first, then the requested
// one is enabled. On failure the mode falls back to DAP_SWO_OFF. The trace
// status is cleared in every case.
uint32_t SWO_Mode (const uint8_t *request, uint8_t *response) {
  const uint8_t wanted = *request;
  uint32_t      ok;

  // Shut down whatever back-end is active right now.
  switch (TraceMode) {
#if (SWO_UART != 0)
    case DAP_SWO_UART:
      (void)SWO_Mode_UART(0U);
      break;
#endif
#if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      (void)SWO_Mode_Manchester(0U);
      break;
#endif
    default:
      break;
  }

  // Bring up the requested back-end.
  switch (wanted) {
    case DAP_SWO_OFF:
      ok = 1U;
      break;
#if (SWO_UART != 0)
    case DAP_SWO_UART:
      ok = SWO_Mode_UART(1U);
      break;
#endif
#if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      ok = SWO_Mode_Manchester(1U);
      break;
#endif
    default:
      ok = 0U;
      break;
  }

  TraceStatus = 0U;
  if (ok != 0U) {
    TraceMode = wanted;
    *response = DAP_OK;
  } else {
    TraceMode = DAP_SWO_OFF;
    *response = DAP_ERROR;
  }

  return ((1U << 16) | 1U);
}
// Process SWO Baudrate command and prepare response
//   request:  pointer to request data (4 bytes: requested baudrate,
//             least significant byte first)
//   response: pointer to response data (4 bytes: actual baudrate,
//             least significant byte first; 0 when not configured)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
// A result of 0 (unsupported mode or back-end failure) also clears the trace
// status.
uint32_t SWO_Baudrate (const uint8_t *request, uint8_t *response) {
  uint32_t baudrate;

  // Widen each byte to uint32_t BEFORE shifting. A uint8_t operand is promoted
  // to (signed) int, so shifting a byte >= 0x80 left by 24 would shift into
  // the sign bit, which is undefined behavior.
  baudrate = ((uint32_t)request[0] <<  0) |
             ((uint32_t)request[1] <<  8) |
             ((uint32_t)request[2] << 16) |
             ((uint32_t)request[3] << 24);

  switch (TraceMode) {
#if (SWO_UART != 0)
    case DAP_SWO_UART:
      baudrate = SWO_Baudrate_UART(baudrate);
      break;
#endif
#if (SWO_MANCHESTER != 0)
    case DAP_SWO_MANCHESTER:
      baudrate = SWO_Baudrate_Manchester(baudrate);
      break;
#endif
    default:
      baudrate = 0U;
      break;
  }

  if (baudrate == 0U) {
    TraceStatus = 0U;
  }

  *response++ = (uint8_t)(baudrate >>  0);
  *response++ = (uint8_t)(baudrate >>  8);
  *response++ = (uint8_t)(baudrate >> 16);
  *response   = (uint8_t)(baudrate >> 24);

  return ((4U << 16) | 4U);
}
// Process SWO Control command and prepare response
//   request:  pointer to request data (one byte; only the
//             DAP_SWO_CAPTURE_ACTIVE bit is evaluated)
//   response: pointer to response data (one byte: DAP_OK or DAP_ERROR)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
// Requesting the state that is already in effect succeeds without touching the
// back-end. Starting a capture clears the trace data and errors first.
uint32_t SWO_Control (const uint8_t *request, uint8_t *response) {
  const uint8_t wanted = *request & DAP_SWO_CAPTURE_ACTIVE;
  uint32_t      ok     = 1U;

  if (wanted != (TraceStatus & DAP_SWO_CAPTURE_ACTIVE)) {
    if (wanted != 0U) {
      ClearTrace();
    }

    switch (TraceMode) {
#if (SWO_UART != 0)
      case DAP_SWO_UART:
        ok = SWO_Control_UART(wanted);
        break;
#endif
#if (SWO_MANCHESTER != 0)
      case DAP_SWO_MANCHESTER:
        ok = SWO_Control_Manchester(wanted);
        break;
#endif
      default:
        ok = 0U;
        break;
    }

    if (ok != 0U) {
      TraceStatus = wanted;
#if (SWO_STREAM != 0)
      // Let the streaming thread notice the state change.
      if (TraceTransport == 2U) {
        osThreadFlagsSet(SWO_ThreadId, 1U);
      }
#endif
    }
  }

  if (ok != 0U) {
    *response = DAP_OK;
  } else {
    *response = DAP_ERROR;
  }

  return ((1U << 16) | 1U);
}
// Process SWO Status command and prepare response
//   response: pointer to response data (5 bytes: trace status followed by the
//             trace count, least significant byte first)
//   return:   number of bytes in response
// Reading the status also clears the reported error flags.
uint32_t SWO_Status (uint8_t *response) {
  const uint8_t  flags   = GetTraceStatus();
  const uint32_t pending = GetTraceCount();

  response[0] = flags;
  response[1] = (uint8_t)(pending >>  0);
  response[2] = (uint8_t)(pending >>  8);
  response[3] = (uint8_t)(pending >> 16);
  response[4] = (uint8_t)(pending >> 24);

  return (5U);
}
// Process SWO Extended Status command and prepare response
//   request:  pointer to request data (one byte selecting the items to report:
//             bit 0 - trace status, bit 1 - trace count,
//             bit 2 - index and timestamp, only when TIMESTAMP_CLOCK != 0)
//   response: pointer to response data; the selected items follow each other
//             in the order status (1 byte), count (4 bytes), index (4 bytes),
//             timestamp tick (4 bytes), multi-byte values least significant
//             byte first
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
uint32_t SWO_ExtendedStatus (const uint8_t *request, uint8_t *response) {
  uint8_t  cmd;
  uint8_t  status;
  uint32_t count;
#if (TIMESTAMP_CLOCK != 0U)
  uint32_t index;
  uint32_t tick;
#endif
  uint32_t num;

  num = 0U;
  cmd = *request;

  if (cmd & 0x01U) {
    status = GetTraceStatus();          // also clears the reported error flags
    *response++ = status;
    num += 1U;
  }

  if (cmd & 0x02U) {
    count = GetTraceCount();
    *response++ = (uint8_t)(count >>  0);
    *response++ = (uint8_t)(count >>  8);
    *response++ = (uint8_t)(count >> 16);
    *response++ = (uint8_t)(count >> 24);
    num += 4U;
  }

#if (TIMESTAMP_CLOCK != 0U)
  if (cmd & 0x04U) {
    // Sample index and tick as a consistent pair: retry when the
    // receive-complete callback updated them while they were being read.
    do {
      TraceUpdate = 0U;
      index = TraceTimestamp.index;
      tick  = TraceTimestamp.tick;
    } while (TraceUpdate != 0U);
    *response++ = (uint8_t)(index >>  0);
    *response++ = (uint8_t)(index >>  8);
    *response++ = (uint8_t)(index >> 16);
    *response++ = (uint8_t)(index >> 24);
    *response++ = (uint8_t)(tick  >>  0);
    *response++ = (uint8_t)(tick  >>  8);
    *response++ = (uint8_t)(tick  >> 16);
    *response++ = (uint8_t)(tick  >> 24);
    // Eight bytes were written above (4 index + 4 tick); reporting only four
    // would cut the timestamp off the response.
    num += 8U;
  }
#endif

  return ((1U << 16) | num);
}
// Process SWO Data command and prepare response
//   request:  pointer to request data (2 bytes: maximum number of trace bytes
//             wanted, least significant byte first)
//   response: pointer to response data (trace status, 2-byte count least
//             significant byte first, then that many trace bytes)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
// Trace bytes are only returned when transport 1 is selected; otherwise the
// reported count is 0. Taking data out may resume a paused capture.
uint32_t SWO_Data (const uint8_t *request, uint8_t *response) {
  const uint8_t status = GetTraceStatus();
  uint32_t      count  = GetTraceCount();
  uint32_t      limit;
  uint32_t      start;
  uint32_t      pos;
  uint32_t      left;

  if (TraceTransport != 1U) {
    // Data is not delivered through this command: status plus a zero count.
    response[0] = status;
    response[1] = 0U;
    response[2] = 0U;
    return ((2U << 16) | 3U);
  }

  // Clamp to what the host asked for and to what fits into one packet.
  limit = (uint32_t)request[0] | ((uint32_t)request[1] << 8);
  if (limit > (DAP_PACKET_SIZE - 4U)) {
    limit = DAP_PACKET_SIZE - 4U;
  }
  if (count > limit) {
    count = limit;
  }

  *response++ = status;
  *response++ = (uint8_t)(count >> 0);
  *response++ = (uint8_t)(count >> 8);

  // Copy out of the ring buffer, wrapping the offset on every byte.
  start = TraceIndexO;
  pos   = start;
  left  = count;
  while (left != 0U) {
    pos &= SWO_BUFFER_SIZE - 1U;
    *response++ = TraceBuf[pos++];
    left--;
  }
  TraceIndexO = start + count;
  ResumeTrace();

  return ((2U << 16) | (3U + count));
}
#if (SWO_STREAM != 0)
// SWO Data Transfer complete callback
// Releases the bytes of the finished transfer from the trace buffer, marks the
// transport as idle, restarts a capture that was paused for lack of space and
// wakes the SWO thread so that it can queue the next transfer.
void SWO_TransferComplete (void) {
TraceIndexO += TransferSize;
TransferBusy = 0U;
ResumeTrace();
osThreadFlagsSet(SWO_ThreadId, 1U);
}
// SWO Thread
// Streaming worker: waits for thread flag 1 (set by the receive callback, by
// SWO_Control and by SWO_TransferComplete) or for a timeout, then queues at
// most one transfer of buffered trace data. Never returns.
// argument: unused
__NO_RETURN void SWO_Thread (void *argument) {
uint32_t timeout;
uint32_t flags;
uint32_t count;
uint32_t index;
uint32_t i, n;
(void) argument;
timeout = osWaitForever;
for (;;) {
flags = osThreadFlagsWait(1U, osFlagsWaitAny, timeout);
if (TraceStatus & DAP_SWO_CAPTURE_ACTIVE) {
// While capturing, wake up periodically so that partial blocks get sent.
timeout = SWO_STREAM_TIMEOUT;
} else {
// Capture not active: wait indefinitely next time and treat this wake-up
// like a timeout, which sends whatever is buffered without block alignment.
timeout = osWaitForever;
flags = osFlagsErrorTimeout;
}
if (TransferBusy == 0U) {
count = GetTraceCount();
if (count != 0U) {
// Never let one transfer run past the end of the ring buffer.
index = TraceIndexO & (SWO_BUFFER_SIZE - 1U);
n = SWO_BUFFER_SIZE - index;
if (count > n) {
count = n;
}
if (flags != osFlagsErrorTimeout) {
// Woken by a flag: keep transfers aligned to USB_BLOCK_SIZE.
i = index & (USB_BLOCK_SIZE - 1U);
if (i == 0U) {
// Start is aligned: send whole blocks only (possibly none).
count &= ~(USB_BLOCK_SIZE - 1U);
} else {
// Start is unaligned: send up to the next block boundary, but only if
// that much data is available.
n = USB_BLOCK_SIZE - i;
if (count >= n) {
count = n;
} else {
count = 0U;
}
}
}
if (count != 0U) {
TransferSize = count;
TransferBusy = 1U;
SWO_QueueTransfer(&TraceBuf[index], count);
}
}
}
}
}
#endif /* (SWO_STREAM != 0) */
#endif /* ((SWO_UART != 0) || (SWO_MANCHESTER != 0)) */

View File

@ -1,286 +0,0 @@
/*
* Copyright (c) 2013-2017 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 1. December 2017
* $Revision: V2.0.0
*
* Project: CMSIS-DAP Source
* Title: SW_DP.c CMSIS-DAP SW DP I/O
*
*---------------------------------------------------------------------------*/
#include "DAP_config.h"
#include "DAP.h"
// SW Macros
//
// PIN_SWCLK_SET / PIN_SWCLK_CLR alias the SWCLK/TCK pin helpers.
//
// The three bit-level macros below expand to several statements. Each one is
// wrapped in do { ... } while (0) so that it behaves as exactly one statement
// at the call site (for example as the unbraced body of an if/else or loop).
// They must still be invoked with a trailing semicolon.
//
// PIN_DELAY() is expanded at the point where these macros are used, so the
// delay flavour can be switched later with #undef / #define PIN_DELAY.
#define PIN_SWCLK_SET PIN_SWCLK_TCK_SET
#define PIN_SWCLK_CLR PIN_SWCLK_TCK_CLR

/* Generate one SWCLK cycle: clock low, delay, clock high, delay */
#define SW_CLOCK_CYCLE()                \
  do {                                  \
    PIN_SWCLK_CLR();                    \
    PIN_DELAY();                        \
    PIN_SWCLK_SET();                    \
    PIN_DELAY();                        \
  } while (0)

/* Drive 'bit' on SWDIO (passed unchanged to PIN_SWDIO_OUT), then clock once */
#define SW_WRITE_BIT(bit)               \
  do {                                  \
    PIN_SWDIO_OUT(bit);                 \
    PIN_SWCLK_CLR();                    \
    PIN_DELAY();                        \
    PIN_SWCLK_SET();                    \
    PIN_DELAY();                        \
  } while (0)

/* Clock once and store the SWDIO level sampled while the clock is low in 'bit' */
#define SW_READ_BIT(bit)                \
  do {                                  \
    PIN_SWCLK_CLR();                    \
    PIN_DELAY();                        \
    (bit) = PIN_SWDIO_IN();             \
    PIN_SWCLK_SET();                    \
    PIN_DELAY();                        \
  } while (0)

/* Default delay: configurable slow delay taken from the DAP settings */
#define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
// Generate SWJ Sequence
//   Shifts 'count' bits out on SWDIO/TMS, least significant bit of each data
//   byte first, generating one clock cycle per bit.
//   count:  sequence bit count
//   data:   pointer to sequence bit data
//   return: none
#if ((DAP_SWD != 0) || (DAP_JTAG != 0))
void SWJ_Sequence (uint32_t count, const uint8_t *data) {
  uint32_t shift_reg = 0U;
  uint32_t bits_left = 0U;

  for (; count != 0U; count--) {
    if (bits_left == 0U) {
      // Current byte is used up: fetch the next one
      shift_reg = *data++;
      bits_left = 8U;
    }
    if ((shift_reg & 1U) != 0U) {
      PIN_SWDIO_TMS_SET();
    } else {
      PIN_SWDIO_TMS_CLR();
    }
    SW_CLOCK_CYCLE();
    shift_reg >>= 1;
    bits_left--;
  }
}
#endif
// Generate SWD Sequence
//   Clocks a sequence of bits on SWDIO, either capturing them (SWD_SEQUENCE_DIN
//   set in info) or driving them from swdo. Bits are transferred LSB first.
//   info:   sequence information (clock count in SWD_SEQUENCE_CLK bits,
//           a count of 0 selects 64 clocks)
//   swdo:   pointer to SWDIO generated data
//   swdi:   pointer to SWDIO captured data
//   return: none
#if (DAP_SWD != 0)
void SWD_Sequence (uint32_t info, const uint8_t *swdo, uint8_t *swdi) {
  uint32_t shift_reg;
  uint32_t sample;
  uint32_t clocks;
  uint32_t slot;

  clocks = info & SWD_SEQUENCE_CLK;
  if (clocks == 0U) {
    clocks = 64U;
  }

  if ((info & SWD_SEQUENCE_DIN) == 0U) {
    // Output: one source byte per up to 8 clocks
    while (clocks != 0U) {
      shift_reg = *swdo++;
      for (slot = 8U; (slot != 0U) && (clocks != 0U); slot--, clocks--) {
        SW_WRITE_BIT(shift_reg);
        shift_reg >>= 1;
      }
    }
  } else {
    // Input: one captured byte per up to 8 clocks
    while (clocks != 0U) {
      shift_reg = 0U;
      for (slot = 8U; (slot != 0U) && (clocks != 0U); slot--, clocks--) {
        SW_READ_BIT(sample);
        shift_reg = (shift_reg >> 1) | (sample << 7);
      }
      // Right-align a final byte that received fewer than 8 bits
      shift_reg >>= slot;
      *swdi++ = (uint8_t)shift_reg;
    }
  }
}
#endif
#if (DAP_SWD != 0)
// SWD Transfer I/O (function template)
//
// SWD_TransferFunction(speed) defines a static function
//   uint8_t SWD_Transfer<speed> (uint32_t request, uint32_t *data)
// that performs one bit-banged SWD transfer:
//   1. sends the 8-bit packet request (Start=1, APnDP, RnW, A2, A3, parity
//      over those four bits, Stop=0, Park=1),
//   2. releases SWDIO and clocks the configured turnaround cycles,
//   3. reads the 3-bit ACK,
//   4. ACK == DAP_TRANSFER_OK: reads 32 data bits plus parity (a parity
//      mismatch turns the result into DAP_TRANSFER_ERROR) or writes 32 data
//      bits plus parity, optionally captures a timestamp, then clocks the
//      configured idle cycles,
//   5. ACK == WAIT or FAULT: clocks a dummy data phase when
//      DAP_Data.swd_conf.data_phase is set,
//   6. any other ACK (protocol error): clocks turnaround + 33 cycles to back
//      off the data phase.
// Every path leaves SWDIO driven high before returning.
//
// request: A[3:2] RnW APnDP (plus DAP_TRANSFER_TIMESTAMP to request a timestamp)
// data: DATA[31:0]; the read path stores the value only when data is not
//       NULL, the write path dereferences data unconditionally
// return: ACK[2:0] (or DAP_TRANSFER_ERROR on read parity mismatch)
//
// PIN_DELAY() is expanded where SWD_TransferFunction(...) is invoked, so the
// generated function uses whichever PIN_DELAY definition is active there.
#define SWD_TransferFunction(speed) /**/ \
static uint8_t SWD_Transfer##speed (uint32_t request, uint32_t *data) { \
uint32_t ack; \
uint32_t bit; \
uint32_t val; \
uint32_t parity; \
\
uint32_t n; \
\
/* Packet Request */ \
parity = 0U; \
SW_WRITE_BIT(1U); /* Start Bit */ \
bit = request >> 0; \
SW_WRITE_BIT(bit); /* APnDP Bit */ \
parity += bit; \
bit = request >> 1; \
SW_WRITE_BIT(bit); /* RnW Bit */ \
parity += bit; \
bit = request >> 2; \
SW_WRITE_BIT(bit); /* A2 Bit */ \
parity += bit; \
bit = request >> 3; \
SW_WRITE_BIT(bit); /* A3 Bit */ \
parity += bit; \
SW_WRITE_BIT(parity); /* Parity Bit */ \
SW_WRITE_BIT(0U); /* Stop Bit */ \
SW_WRITE_BIT(1U); /* Park Bit */ \
\
/* Turnaround */ \
PIN_SWDIO_OUT_DISABLE(); \
for (n = DAP_Data.swd_conf.turnaround; n; n--) { \
SW_CLOCK_CYCLE(); \
} \
\
/* Acknowledge response */ \
SW_READ_BIT(bit); \
ack = bit << 0; \
SW_READ_BIT(bit); \
ack |= bit << 1; \
SW_READ_BIT(bit); \
ack |= bit << 2; \
\
if (ack == DAP_TRANSFER_OK) { /* OK response */ \
/* Data transfer */ \
if (request & DAP_TRANSFER_RnW) { \
/* Read data */ \
val = 0U; \
parity = 0U; \
for (n = 32U; n; n--) { \
SW_READ_BIT(bit); /* Read RDATA[0:31] */ \
parity += bit; \
val >>= 1; \
val |= bit << 31; \
} \
SW_READ_BIT(bit); /* Read Parity */ \
if ((parity ^ bit) & 1U) { \
ack = DAP_TRANSFER_ERROR; \
} \
if (data) { *data = val; } \
/* Turnaround */ \
for (n = DAP_Data.swd_conf.turnaround; n; n--) { \
SW_CLOCK_CYCLE(); \
} \
PIN_SWDIO_OUT_ENABLE(); \
} else { \
/* Turnaround */ \
for (n = DAP_Data.swd_conf.turnaround; n; n--) { \
SW_CLOCK_CYCLE(); \
} \
PIN_SWDIO_OUT_ENABLE(); \
/* Write data */ \
val = *data; \
parity = 0U; \
for (n = 32U; n; n--) { \
SW_WRITE_BIT(val); /* Write WDATA[0:31] */ \
parity += val; \
val >>= 1; \
} \
SW_WRITE_BIT(parity); /* Write Parity Bit */ \
} \
/* Capture Timestamp */ \
if (request & DAP_TRANSFER_TIMESTAMP) { \
DAP_Data.timestamp = TIMESTAMP_GET(); \
} \
/* Idle cycles */ \
n = DAP_Data.transfer.idle_cycles; \
if (n) { \
PIN_SWDIO_OUT(0U); \
for (; n; n--) { \
SW_CLOCK_CYCLE(); \
} \
} \
PIN_SWDIO_OUT(1U); \
return ((uint8_t)ack); \
} \
\
if ((ack == DAP_TRANSFER_WAIT) || (ack == DAP_TRANSFER_FAULT)) { \
/* WAIT or FAULT response */ \
if (DAP_Data.swd_conf.data_phase && ((request & DAP_TRANSFER_RnW) != 0U)) { \
for (n = 32U+1U; n; n--) { \
SW_CLOCK_CYCLE(); /* Dummy Read RDATA[0:31] + Parity */ \
} \
} \
/* Turnaround */ \
for (n = DAP_Data.swd_conf.turnaround; n; n--) { \
SW_CLOCK_CYCLE(); \
} \
PIN_SWDIO_OUT_ENABLE(); \
if (DAP_Data.swd_conf.data_phase && ((request & DAP_TRANSFER_RnW) == 0U)) { \
PIN_SWDIO_OUT(0U); \
for (n = 32U+1U; n; n--) { \
SW_CLOCK_CYCLE(); /* Dummy Write WDATA[0:31] + Parity */ \
} \
} \
PIN_SWDIO_OUT(1U); \
return ((uint8_t)ack); \
} \
\
/* Protocol error */ \
for (n = DAP_Data.swd_conf.turnaround + 32U + 1U; n; n--) { \
SW_CLOCK_CYCLE(); /* Back off data phase */ \
} \
PIN_SWDIO_OUT_ENABLE(); \
PIN_SWDIO_OUT(1U); \
return ((uint8_t)ack); \
}
// Instantiate the transfer function twice, once per delay flavour.
// PIN_DELAY is redefined before each instantiation because the bit macros
// used inside SWD_TransferFunction expand PIN_DELAY() at this point.

// SWD_TransferFast: uses PIN_DELAY_FAST()
#undef PIN_DELAY
#define PIN_DELAY() PIN_DELAY_FAST()
SWD_TransferFunction(Fast)
// SWD_TransferSlow: uses PIN_DELAY_SLOW(DAP_Data.clock_delay); this also
// restores the default PIN_DELAY definition for the code that follows
#undef PIN_DELAY
#define PIN_DELAY() PIN_DELAY_SLOW(DAP_Data.clock_delay)
SWD_TransferFunction(Slow)
// SWD Transfer I/O
//   Dispatches to the fast or slow transfer routine depending on
//   DAP_Data.fast_clock.
//   request: A[3:2] RnW APnDP
//   data:    DATA[31:0]
//   return:  ACK[2:0]
uint8_t SWD_Transfer(uint32_t request, uint32_t *data) {
  return (DAP_Data.fast_clock) ? SWD_TransferFast(request, data)
                               : SWD_TransferSlow(request, data);
}
#endif /* (DAP_SWD != 0) */

View File

@ -1,652 +0,0 @@
/*
* Copyright (c) 2021 ARM Limited. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ----------------------------------------------------------------------
*
* $Date: 1. March 2021
* $Revision: V1.0.0
*
* Project: CMSIS-DAP Source
* Title: UART.c CMSIS-DAP UART
*
*---------------------------------------------------------------------------*/
#include "DAP_config.h"
#include "DAP.h"
#if (DAP_UART != 0)
#ifdef DAP_FW_V1
#error "UART Communication Port not supported in DAP V1!"
#endif
#include "Driver_USART.h"
#include "cmsis_os2.h"
#include <string.h>
/* Number of bytes requested from the driver per Receive() call; the RX write
   index advances by this amount on every receive-complete event */
#define UART_RX_BLOCK_SIZE 32U /* Uart Rx Block Size (must be 2^n) */
// USART Driver
// Two-level macro so that DAP_UART_DRIVER is expanded before token pasting,
// yielding the driver instance name Driver_USART<n>
#define _USART_Driver_(n) Driver_USART##n
#define USART_Driver_(n) _USART_Driver_(n)
extern ARM_DRIVER_USART USART_Driver_(DAP_UART_DRIVER);
#define pUSART (&USART_Driver_(DAP_UART_DRIVER))
// UART Configuration
// Currently selected transport; starts as USB COM port when that feature is
// compiled in, otherwise as "none"
#if (DAP_UART_USB_COM_PORT != 0)
static uint8_t UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
#else
static uint8_t UartTransport = DAP_UART_TRANSPORT_NONE;
#endif
// UART Flags
static uint8_t UartConfigured = 0U;       // set after a successful UART_Configure
static uint8_t UartReceiveEnabled = 0U;   // receiver enabled via UART_Receive_Enable
static uint8_t UartTransmitEnabled = 0U;  // transmitter enabled via UART_Transmit_Enable
static uint8_t UartTransmitActive = 0U;   // a Send() is in progress (cleared in USART_Callback)
// UART TX Buffer
// Ring buffer with free-running indices: I = write (input) index,
// O = read (output) index; both are masked with (size - 1) when used.
// NOTE(review): the masking assumes the buffer size is a power of two - confirm in DAP_config.h
static uint8_t UartTxBuf[DAP_UART_TX_BUFFER_SIZE];
static volatile uint32_t UartTxIndexI = 0U;
static volatile uint32_t UartTxIndexO = 0U;
// UART RX Buffer
// Same index scheme as the TX buffer; UartRxIndexI is advanced in USART_Callback
static uint8_t UartRxBuf[DAP_UART_RX_BUFFER_SIZE];
static volatile uint32_t UartRxIndexI = 0U;
static volatile uint32_t UartRxIndexO = 0U;
// Uart Errors
// Sticky flags set in USART_Callback (and on overflow detection) and cleared
// when they are reported by UART_Get_Status
static volatile uint8_t UartErrorRxDataLost = 0U;
static volatile uint8_t UartErrorFraming = 0U;
static volatile uint8_t UartErrorParity = 0U;
// UART Transmit
// Size of the Send() block currently in flight; added to UartTxIndexO on completion
static uint32_t UartTxNum = 0U;
// Function prototypes
static uint8_t UART_Init (void);
static void UART_Uninit (void);
static uint8_t UART_Get_Status (void);
static uint8_t UART_Receive_Enable (void);
static uint8_t UART_Transmit_Enable (void);
static void UART_Receive_Disable (void);
static void UART_Transmit_Disable (void);
static void UART_Receive_Flush (void);
static void UART_Transmit_Flush (void);
static void UART_Receive (void);
static void UART_Transmit (void);
// USART Driver Callback function (registered with the driver in UART_Init)
//   event: event mask reported by the USART driver
static void USART_Callback (uint32_t event) {

  if ((event & ARM_USART_EVENT_SEND_COMPLETE) != 0U) {
    // The block in flight has been sent: consume it and start the next one
    UartTxIndexO = UartTxIndexO + UartTxNum;
    UartTransmitActive = 0U;
    UART_Transmit();
  }

  if ((event & ARM_USART_EVENT_RECEIVE_COMPLETE) != 0U) {
    // One full receive block arrived: publish it and re-arm the receiver
    UartRxIndexI = UartRxIndexI + UART_RX_BLOCK_SIZE;
    UART_Receive();
  }

  // Latch error conditions; they are reported and cleared by UART_Get_Status
  if ((event & ARM_USART_EVENT_RX_OVERFLOW) != 0U) {
    UartErrorRxDataLost = 1U;
  }
  if ((event & ARM_USART_EVENT_RX_FRAMING_ERROR) != 0U) {
    UartErrorFraming = 1U;
  }
  if ((event & ARM_USART_EVENT_RX_PARITY_ERROR) != 0U) {
    UartErrorParity = 1U;
  }
}
// Init UART
//   Resets all module state, then initializes and powers up the USART driver.
//   return: DAP_OK or DAP_ERROR
static uint8_t UART_Init (void) {

  // Flags
  UartConfigured      = 0U;
  UartReceiveEnabled  = 0U;
  UartTransmitEnabled = 0U;
  UartTransmitActive  = 0U;

  // Error latches
  UartErrorRxDataLost = 0U;
  UartErrorFraming    = 0U;
  UartErrorParity     = 0U;

  // Buffer indices
  UartTxIndexI = 0U;
  UartTxIndexO = 0U;
  UartRxIndexI = 0U;
  UartRxIndexO = 0U;
  UartTxNum    = 0U;

  if (pUSART->Initialize(USART_Callback) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  // Power is only requested when initialization succeeded
  if (pUSART->PowerControl(ARM_POWER_FULL) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }
  return (DAP_OK);
}
// Un-Init UART
// Marks the UART as not configured, then powers the USART driver off and
// uninitializes it. The driver return codes are not checked.
static void UART_Uninit (void) {
UartConfigured = 0U;
pUSART->PowerControl(ARM_POWER_OFF);
pUSART->Uninitialize();
}
// Get UART Status
//   Builds the DAP_UART_STATUS_* bit mask from the enable flags and the
//   latched error flags. Reporting an error flag also clears it.
//   return: status
static uint8_t UART_Get_Status (void) {
  uint8_t flags = 0U;

  // Enable state (not modified by reading)
  if (UartReceiveEnabled != 0U) {
    flags |= DAP_UART_STATUS_RX_ENABLED;
  }

  // Error latches: report once, then clear
  if (UartErrorRxDataLost != 0U) {
    UartErrorRxDataLost = 0U;
    flags |= DAP_UART_STATUS_RX_DATA_LOST;
  }
  if (UartErrorFraming != 0U) {
    UartErrorFraming = 0U;
    flags |= DAP_UART_STATUS_FRAMING_ERROR;
  }
  if (UartErrorParity != 0U) {
    UartErrorParity = 0U;
    flags |= DAP_UART_STATUS_PARITY_ERROR;
  }

  if (UartTransmitEnabled != 0U) {
    flags |= DAP_UART_STATUS_TX_ENABLED;
  }

  return (flags);
}
// Enable UART Receive
//   Does nothing when the receiver is already enabled. Otherwise clears the
//   RX buffer indices, queues the first receive block and enables the
//   driver's receiver.
//   return: DAP_OK or DAP_ERROR
static uint8_t UART_Receive_Enable (void) {

  if (UartReceiveEnabled != 0U) {
    return (DAP_OK);
  }

  // Flush Buffers
  UartRxIndexI = 0U;
  UartRxIndexO = 0U;

  UART_Receive();

  if (pUSART->Control(ARM_USART_CONTROL_RX, 1U) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }

  UartReceiveEnabled = 1U;
  return (DAP_OK);
}
// Enable UART Transmit
//   Does nothing when the transmitter is already enabled. Otherwise clears
//   the TX buffer state and enables the driver's transmitter.
//   return: DAP_OK or DAP_ERROR
static uint8_t UART_Transmit_Enable (void) {

  if (UartTransmitEnabled != 0U) {
    return (DAP_OK);
  }

  // Flush Buffers
  UartTransmitActive = 0U;
  UartTxIndexI       = 0U;
  UartTxIndexO       = 0U;
  UartTxNum          = 0U;

  if (pUSART->Control(ARM_USART_CONTROL_TX, 1U) != ARM_DRIVER_OK) {
    return (DAP_ERROR);
  }

  UartTransmitEnabled = 1U;
  return (DAP_OK);
}
// Disable UART Receive
//   No-op when the receiver is not enabled; otherwise disables the driver's
//   receiver, aborts the pending receive and clears the enable flag.
static void UART_Receive_Disable (void) {
  if (UartReceiveEnabled == 0U) {
    return;
  }
  pUSART->Control(ARM_USART_CONTROL_RX, 0U);
  pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
  UartReceiveEnabled = 0U;
}
// Disable UART Transmit
//   No-op when the transmitter is not enabled; otherwise aborts the send in
//   progress, disables the driver's transmitter and clears the active and
//   enable flags.
static void UART_Transmit_Disable (void) {
  if (UartTransmitEnabled == 0U) {
    return;
  }
  pUSART->Control(ARM_USART_ABORT_SEND, 0U);
  pUSART->Control(ARM_USART_CONTROL_TX, 0U);
  UartTransmitActive  = 0U;
  UartTransmitEnabled = 0U;
}
// Flush UART Receive buffer
// Aborts the pending receive, discards all buffered RX data by resetting both
// indices, and re-arms reception if the receiver is enabled.
static void UART_Receive_Flush (void) {
pUSART->Control(ARM_USART_ABORT_RECEIVE, 0U);
UartRxIndexI = 0U;
UartRxIndexO = 0U;
if (UartReceiveEnabled != 0U) {
// The abort above cancelled the queued block; queue a new one
UART_Receive();
}
}
// Flush UART Transmit buffer
// Aborts the send in progress and discards all buffered TX data by resetting
// the active flag, both indices and the in-flight block size.
static void UART_Transmit_Flush (void) {
pUSART->Control(ARM_USART_ABORT_SEND, 0U);
UartTransmitActive = 0U;
UartTxIndexI = 0U;
UartTxIndexO = 0U;
UartTxNum = 0U;
}
// Receive data from target via UART
//   Queues one UART_RX_BLOCK_SIZE byte receive into the RX buffer at the
//   current (wrapped) write index.
static void UART_Receive (void) {
  uint8_t *dest = &UartRxBuf[UartRxIndexI & (DAP_UART_RX_BUFFER_SIZE - 1U)];

  pUSART->Receive(dest, UART_RX_BLOCK_SIZE);
}
// Transmit available data to target via UART
//   Starts one Send() for the buffered TX data. A send never crosses the end
//   of the TX buffer; data that wraps around is sent by the next call.
static void UART_Transmit (void) {
  uint32_t pending;
  uint32_t offset;

  pending = UartTxIndexI - UartTxIndexO;
  offset  = UartTxIndexO & (DAP_UART_TX_BUFFER_SIZE - 1U);

  if (pending == 0U) {
    return;
  }

  // Send everything if it is contiguous, otherwise only up to the buffer end
  UartTxNum = ((offset + pending) <= DAP_UART_TX_BUFFER_SIZE)
            ? pending
            : (DAP_UART_TX_BUFFER_SIZE - offset);

  UartTransmitActive = 1U;
  pUSART->Send(&UartTxBuf[offset], UartTxNum);
}
// Process UART Transport command and prepare response
// Switches the UART between the three transports (none, USB COM port, DAP
// command). The outer switch selects the requested transport, the inner
// switch handles the transition from the currently active one.
// request: pointer to request data (byte 0: requested transport)
// response: pointer to response data (byte 0: DAP_OK or DAP_ERROR)
// return: number of bytes in response (lower 16 bits)
// number of bytes in request (upper 16 bits)
uint32_t UART_Transport (const uint8_t *request, uint8_t *response) {
uint8_t transport;
uint8_t ret = DAP_ERROR;
transport = *request;
switch (transport) {
// Requested: no transport
case DAP_UART_TRANSPORT_NONE:
switch (UartTransport) {
case DAP_UART_TRANSPORT_NONE:
ret = DAP_OK;
break;
case DAP_UART_TRANSPORT_USB_COM_PORT:
// Without USB COM port support this transition reports DAP_ERROR
#if (DAP_UART_USB_COM_PORT != 0)
USB_COM_PORT_Activate(0U);
UartTransport = DAP_UART_TRANSPORT_NONE;
ret = DAP_OK;
#endif
break;
case DAP_UART_TRANSPORT_DAP_COMMAND:
UART_Receive_Disable();
UART_Transmit_Disable();
UART_Uninit();
UartTransport = DAP_UART_TRANSPORT_NONE;
ret= DAP_OK;
break;
}
break;
// Requested: USB COM port
case DAP_UART_TRANSPORT_USB_COM_PORT:
switch (UartTransport) {
case DAP_UART_TRANSPORT_NONE:
#if (DAP_UART_USB_COM_PORT != 0)
if (USB_COM_PORT_Activate(1U) == 0U) {
UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
ret = DAP_OK;
}
#endif
break;
case DAP_UART_TRANSPORT_USB_COM_PORT:
ret = DAP_OK;
break;
case DAP_UART_TRANSPORT_DAP_COMMAND:
// The DAP command transport is always shut down first; if the USB COM
// port cannot be activated the transport stays at "none" with DAP_ERROR
UART_Receive_Disable();
UART_Transmit_Disable();
UART_Uninit();
UartTransport = DAP_UART_TRANSPORT_NONE;
#if (DAP_UART_USB_COM_PORT != 0)
if (USB_COM_PORT_Activate(1U) == 0U) {
UartTransport = DAP_UART_TRANSPORT_USB_COM_PORT;
ret = DAP_OK;
}
#endif
break;
}
break;
// Requested: DAP command transport
case DAP_UART_TRANSPORT_DAP_COMMAND:
switch (UartTransport) {
case DAP_UART_TRANSPORT_NONE:
ret = UART_Init();
if (ret == DAP_OK) {
UartTransport = DAP_UART_TRANSPORT_DAP_COMMAND;
}
break;
case DAP_UART_TRANSPORT_USB_COM_PORT:
#if (DAP_UART_USB_COM_PORT != 0)
USB_COM_PORT_Activate(0U);
UartTransport = DAP_UART_TRANSPORT_NONE;
#endif
ret = UART_Init();
if (ret == DAP_OK) {
UartTransport = DAP_UART_TRANSPORT_DAP_COMMAND;
}
break;
case DAP_UART_TRANSPORT_DAP_COMMAND:
ret = DAP_OK;
break;
}
break;
// Unknown transport value: ret keeps its initial DAP_ERROR
default:
break;
}
*response = ret;
return ((1U << 16) | 1U);
}
// Process UART Configure command and prepare response
//   Applies the requested frame format and baudrate to the USART driver
//   (always asynchronous mode without flow control). Only valid while the
//   DAP command transport is selected.
//   request:  pointer to request data
//               byte 0:    control (frame format bits passed to the driver)
//               byte 1..4: baudrate, 32-bit little-endian
//   response: pointer to response data
//               byte 0:    status (0 or DAP_UART_CFG_ERROR_* bits)
//               byte 1..4: baudrate, 32-bit little-endian (0 = baudrate error)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
uint32_t UART_Configure (const uint8_t *request, uint8_t *response) {
  uint8_t  control, status;
  uint32_t baudrate;
  int32_t  result;

  if (UartTransport != DAP_UART_TRANSPORT_DAP_COMMAND) {
    // Wrong transport: report every configuration error at once
    status   = DAP_UART_CFG_ERROR_DATA_BITS |
               DAP_UART_CFG_ERROR_PARITY    |
               DAP_UART_CFG_ERROR_STOP_BITS;
    baudrate = 0U; // baudrate error
  } else {
    status  = 0U;
    control = *request;
    // Widen each byte to uint32_t BEFORE shifting. Shifting the promoted
    // (signed int) byte left by 24 is undefined behavior when bit 7 is set.
    baudrate = ((uint32_t)request[1] <<  0) |
               ((uint32_t)request[2] <<  8) |
               ((uint32_t)request[3] << 16) |
               ((uint32_t)request[4] << 24);

    result = pUSART->Control(control |
                             ARM_USART_MODE_ASYNCHRONOUS |
                             ARM_USART_FLOW_CONTROL_NONE,
                             baudrate);
    if (result == ARM_DRIVER_OK) {
      UartConfigured = 1U;
    } else {
      UartConfigured = 0U;
      // Translate the driver error into the DAP status / baudrate fields
      switch (result) {
        case ARM_USART_ERROR_BAUDRATE:
          status   = 0U;
          baudrate = 0U;
          break;
        case ARM_USART_ERROR_DATA_BITS:
          status = DAP_UART_CFG_ERROR_DATA_BITS;
          break;
        case ARM_USART_ERROR_PARITY:
          status = DAP_UART_CFG_ERROR_PARITY;
          break;
        case ARM_USART_ERROR_STOP_BITS:
          status = DAP_UART_CFG_ERROR_STOP_BITS;
          break;
        default:
          // Any other driver error: flag everything as invalid
          status   = DAP_UART_CFG_ERROR_DATA_BITS |
                     DAP_UART_CFG_ERROR_PARITY    |
                     DAP_UART_CFG_ERROR_STOP_BITS;
          baudrate = 0U;
          break;
      }
    }
  }

  *response++ = status;
  *response++ = (uint8_t)(baudrate >>  0);
  *response++ = (uint8_t)(baudrate >>  8);
  *response++ = (uint8_t)(baudrate >> 16);
  *response   = (uint8_t)(baudrate >> 24);

  return ((5U << 16) | 5U);
}
// Process UART Control command and prepare response
//   Enables/disables the receiver and transmitter and flushes the buffers as
//   selected by the DAP_UART_CONTROL_* bits. A disable bit takes precedence
//   over the matching enable bit; enabling requires a configured UART.
//   request:  pointer to request data (byte 0: control bits)
//   response: pointer to response data (byte 0: DAP_OK or DAP_ERROR)
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
uint32_t UART_Control (const uint8_t *request, uint8_t *response) {
  uint8_t ctrl;
  uint8_t ret = DAP_OK;

  if (UartTransport == DAP_UART_TRANSPORT_DAP_COMMAND) {
    ctrl = *request;

    // Receiver
    if ((ctrl & DAP_UART_CONTROL_RX_DISABLE) != 0U) {
      UART_Receive_Disable();
    } else if ((ctrl & DAP_UART_CONTROL_RX_ENABLE) != 0U) {
      // Enable is only attempted on a configured UART
      if ((UartConfigured == 0U) || (UART_Receive_Enable() != DAP_OK)) {
        ret = DAP_ERROR;
      }
    }
    if ((ctrl & DAP_UART_CONTROL_RX_BUF_FLUSH) != 0U) {
      UART_Receive_Flush();
    }

    // Transmitter
    if ((ctrl & DAP_UART_CONTROL_TX_DISABLE) != 0U) {
      UART_Transmit_Disable();
    } else if ((ctrl & DAP_UART_CONTROL_TX_ENABLE) != 0U) {
      // Enable is only attempted on a configured UART
      if ((UartConfigured == 0U) || (UART_Transmit_Enable() != DAP_OK)) {
        ret = DAP_ERROR;
      }
    }
    if ((ctrl & DAP_UART_CONTROL_TX_BUF_FLUSH) != 0U) {
      UART_Transmit_Flush();
    }
  } else {
    ret = DAP_ERROR;
  }

  *response = ret;
  return ((1U << 16) | 1U);
}
// Process UART Status command and prepare response
//   Reports the status bits and the number of bytes pending in the RX and TX
//   buffers. All values are 0 unless the DAP command transport is selected
//   and the UART is configured.
//   response: pointer to response data
//               byte 0:    status
//               byte 1..4: RX byte count, 32-bit little-endian
//               byte 5..8: TX byte count, 32-bit little-endian
//   return:   number of bytes in response (lower 16 bits)
//             number of bytes in request (upper 16 bits)
uint32_t UART_Status (uint8_t *response) {
  uint32_t rx_pending = 0U;
  uint32_t tx_pending = 0U;
  uint32_t tx_sent;
  uint32_t shift;
  uint8_t  flags = 0U;

  if ((UartTransport == DAP_UART_TRANSPORT_DAP_COMMAND) &&
      (UartConfigured != 0U)) {
    // Completed blocks plus the bytes already received in the current block
    rx_pending  = UartRxIndexI - UartRxIndexO;
    rx_pending += pUSART->GetRxCount();
    if (rx_pending > (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2))) {
      // Overflow: flag the loss and drop the oldest data
      UartErrorRxDataLost = 1U;
      rx_pending   = (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2));
      UartRxIndexO = UartRxIndexI - rx_pending;
    }

    // Buffered bytes minus what the active send has already put out
    tx_pending = UartTxIndexI - UartTxIndexO;
    tx_sent    = pUSART->GetTxCount();
    if (UartTransmitActive != 0U) {
      tx_pending -= tx_sent;
    }

    flags = UART_Get_Status();
  }

  response[0] = flags;
  for (shift = 0U; shift < 32U; shift += 8U) {
    response[1U + (shift >> 3)] = (uint8_t)(rx_pending >> shift);
    response[5U + (shift >> 3)] = (uint8_t)(tx_pending >> shift);
  }

  return ((0U << 16) | 9U);
}
// Process UART Transfer command and prepare response
// Copies received UART data into the response and queues the TX data carried
// in the request for transmission. Does nothing (all counts 0) unless the DAP
// command transport is selected.
// request: pointer to request data
//   byte 0..1: maximum number of RX bytes wanted (16-bit little-endian)
//   byte 2..3: number of TX bytes that follow (16-bit little-endian)
//   byte 4.. : TX data
// response: pointer to response data
//   byte 0:    status
//   byte 1..2: number of TX bytes accepted (16-bit little-endian)
//   byte 3..4: number of RX bytes returned (16-bit little-endian)
//   byte 5.. : RX data
// return: number of bytes in response (lower 16 bits)
// number of bytes in request (upper 16 bits)
uint32_t UART_Transfer (const uint8_t *request, uint8_t *response) {
uint32_t rx_cnt, tx_cnt;
uint32_t rx_num, tx_num;
uint8_t *rx_data;
const
uint8_t *tx_data;
uint32_t num;
uint32_t index;
uint8_t status;
if (UartTransport != DAP_UART_TRANSPORT_DAP_COMMAND) {
status = 0U;
rx_cnt = 0U;
tx_cnt = 0U;
} else {
// RX Data
rx_cnt = ((uint32_t)(*(request+0) << 0) |
(uint32_t)(*(request+1) << 8));
// Clamp the requested RX count to the bound (packet size - 6)
if (rx_cnt > (DAP_PACKET_SIZE - 6U)) {
rx_cnt = (DAP_PACKET_SIZE - 6U);
}
// Available = completed blocks + bytes already received in the current block
rx_num = UartRxIndexI - UartRxIndexO;
rx_num += pUSART->GetRxCount();
if (rx_num > (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2))) {
// Overflow
// Flag the loss and move the read index forward so that only the newest
// data is kept
UartErrorRxDataLost = 1U;
rx_num = (DAP_UART_RX_BUFFER_SIZE - (UART_RX_BLOCK_SIZE*2));
UartRxIndexO = UartRxIndexI - rx_num;
}
if (rx_cnt > rx_num) {
rx_cnt = rx_num;
}
// RX payload starts after the 5 header bytes written at the end
rx_data = (response+5);
index = UartRxIndexO & (DAP_UART_RX_BUFFER_SIZE - 1U);
if ((index + rx_cnt) <= DAP_UART_RX_BUFFER_SIZE) {
memcpy( rx_data, &UartRxBuf[index], rx_cnt);
} else {
// Data wraps around the end of the ring buffer: copy in two pieces
num = DAP_UART_RX_BUFFER_SIZE - index;
memcpy( rx_data, &UartRxBuf[index], num);
memcpy(&rx_data[num], &UartRxBuf[0], rx_cnt - num);
}
UartRxIndexO += rx_cnt;
// TX Data
tx_cnt = ((uint32_t)(*(request+2) << 0) |
(uint32_t)(*(request+3) << 8));
tx_data = (request+4);
// Clamp the TX count to the bound (packet size - 5)
if (tx_cnt > (DAP_PACKET_SIZE - 5U)) {
tx_cnt = (DAP_PACKET_SIZE - 5U);
}
// Bytes still occupying the TX buffer (minus what the active send already put out)
tx_num = UartTxIndexI - UartTxIndexO;
num = pUSART->GetTxCount();
if (UartTransmitActive != 0U) {
tx_num -= num;
}
// Accept only as many bytes as there is free space in the TX buffer
if (tx_cnt > (DAP_UART_TX_BUFFER_SIZE - tx_num)) {
tx_cnt = (DAP_UART_TX_BUFFER_SIZE - tx_num);
}
index = UartTxIndexI & (DAP_UART_TX_BUFFER_SIZE - 1U);
if ((index + tx_cnt) <= DAP_UART_TX_BUFFER_SIZE) {
memcpy(&UartTxBuf[index], tx_data, tx_cnt);
} else {
// Data wraps around the end of the ring buffer: copy in two pieces
num = DAP_UART_TX_BUFFER_SIZE - index;
memcpy(&UartTxBuf[index], tx_data, num);
memcpy(&UartTxBuf[0], &tx_data[num], tx_cnt - num);
}
UartTxIndexI += tx_cnt;
// Start sending if idle; otherwise the send-complete callback continues
if (UartTransmitActive == 0U) {
UART_Transmit();
}
status = UART_Get_Status();
}
*response++ = status;
*response++ = (uint8_t)(tx_cnt >> 0);
*response++ = (uint8_t)(tx_cnt >> 8);
*response++ = (uint8_t)(rx_cnt >> 0);
*response = (uint8_t)(rx_cnt >> 8);
// NOTE(review): the request length is derived from the accepted tx_cnt, which
// may be smaller than the count sent by the host - confirm this is intended
return (((4U + tx_cnt) << 16) | (5U + rx_cnt));
}
#endif /* DAP_UART */

View File

@ -1,29 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Top-level build script of the CMSIS-NN library.
cmake_minimum_required(VERSION 3.15.6)
project(CMSISNN)
# Root of the CMSIS tree, two directories above this file.
set(CMSIS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..")
# The library sources are built by default; configure with
# -DBUILD_CMSIS_NN_FUNCTIONS=OFF to skip the Source subdirectory.
option(BUILD_CMSIS_NN_FUNCTIONS "Build CMSIS-NN Source." ON)
if(BUILD_CMSIS_NN_FUNCTIONS)
add_subdirectory(Source)
endif()

View File

@ -1,169 +0,0 @@
/******************************************************************************
* @file arm_nn_math_types.h
* @brief Compiler include and basic types
* @version V1.1.0
* @date 09 March 2022
* Target Processor: Cortex-M
******************************************************************************/
/*
* Copyright (c) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
Copied from CMSIS/DSP/arm_math_types.h and modified
*/
#ifndef _ARM_NN_MATH_TYPES_H_
#define _ARM_NN_MATH_TYPES_H_
/* DSP include for enum arm_status. */
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
/* Included for intrinsics definitions */
/* Provide the inline/alignment helper macros for toolchains that do not use
   cmsis_compiler.h; every macro is only defined if it is not defined yet. */
#if defined(_MSC_VER)
/* Microsoft compiler: MSVC-specific keywords */
#include <stdint.h>
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static __forceinline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __declspec(align(x))
#endif
#elif defined(__GNUC_PYTHON__)
/* GCC-style attributes; NOTE(review): __GNUC_PYTHON__ is presumably set when
   building the Python wrapper on a host - confirm */
#include <stdint.h>
#ifndef __ALIGNED
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#else
/* All other toolchains: take the definitions from the CMSIS compiler header */
#include "cmsis_compiler.h"
#endif
#include <float.h>
#include <limits.h>
#include <math.h>
#include <string.h>
/* evaluate ARM DSP feature */
/* Define ARM_MATH_DSP (unless already defined) when the compiler reports the
   DSP extension through __ARM_FEATURE_DSP == 1 */
#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
#ifndef ARM_MATH_DSP
#define ARM_MATH_DSP 1
#endif
#endif
/* Define ARM_MATH_MVEI (unless already defined) when __ARM_FEATURE_MVE is
   non-zero; an undefined __ARM_FEATURE_MVE evaluates to 0 in #if */
#if __ARM_FEATURE_MVE
#ifndef ARM_MATH_MVEI
#define ARM_MATH_MVEI
#endif
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
// #pragma GCC diagnostic pop
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
#ifdef __cplusplus
}
#endif
#if __ARM_FEATURE_MVE
#include <arm_mve.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Limits of the fixed-point types: largest (0x7F...) and smallest
*        (0x80...) bit patterns for q31_t, q15_t and q7_t.
*
* NOTE(review): q31_t/q15_t/q7_t are not declared in this header; they are
* presumably provided by arm_math_types.h - confirm.
*/
#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL))
#define NN_Q15_MAX ((q15_t)(0x7FFF))
#define NN_Q7_MAX ((q7_t)(0x7F))
#define NN_Q31_MIN ((q31_t)(0x80000000L))
#define NN_Q15_MIN ((q15_t)(0x8000))
#define NN_Q7_MIN ((q7_t)(0x80))
/**
* @brief Error status returned by some functions in the library.
*
* Success is 0; every error code is negative.
*/
typedef enum
{
ARM_CMSIS_NN_SUCCESS = 0, /**< No error */
ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */
ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */
} arm_cmsis_nn_status;
#ifdef __cplusplus
}
#endif
#endif /*ifndef _ARM_NN_MATH_TYPES_H_ */

View File

@ -1,56 +0,0 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 17. August 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_NN_TABLES_H
#define _ARM_NN_TABLES_H
#include "arm_nn_math_types.h"
/**
* @brief Look-up tables for the sigmoid and tanh activation functions
*
* Each table has 256 entries and exists in a q15_t and a q7_t variant.
* The tables are only declared here; they are defined elsewhere.
*/
extern const q15_t sigmoidTable_q15[256];
extern const q7_t sigmoidTable_q7[256];
extern const q7_t tanhTable_q7[256];
extern const q15_t tanhTable_q15[256];
/**
* @brief 2-way tables for various activation functions
*
* The sigmoid look-up is split into two tables:
* the L table (128 entries) is used for values smaller than 1/4,
* the H table (192 entries) is used for values larger than 1/4 and for the
* remaining values.
* These tables exist only for the q15_t version; it does not make sense to
* have them for the q7_t type.
*/
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
#endif /* _ARM_NN_TABLES_H */

View File

@ -1,137 +0,0 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_types.h
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 22. February 2022
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#ifndef _ARM_NN_TYPES_H
#define _ARM_NN_TYPES_H
#include <stdint.h>
/** CMSIS-NN object to contain the width and height of a tile.
 *  Also used by the layer parameter structs in this header to describe
 *  stride, padding and dilation. */
typedef struct
{
int32_t w; /**< Width */
int32_t h; /**< Height */
} cmsis_nn_tile;
/** CMSIS-NN object used for the function context.
 *  Carries a caller-provided scratch buffer together with its size. */
typedef struct
{
void *buf; /**< Pointer to a buffer needed for the optimization */
int32_t size; /**< Buffer size (NOTE(review): unit presumably bytes - confirm) */
} cmsis_nn_context;
/** CMSIS-NN object to contain the dimensions of the tensors.
 *  Four dimensions are stored in the order n, h, w, c. */
typedef struct
{
int32_t n; /**< Generic dimension to contain either the batch size or output channels.
Please refer to the function documentation for more information */
int32_t h; /**< Height */
int32_t w; /**< Width */
int32_t c; /**< Input channels */
} cmsis_nn_dims;
/** CMSIS-NN object for the per-channel quantization parameters.
 *  Both members point to arrays owned by the caller
 *  (NOTE(review): presumably one entry per output channel - confirm). */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
} cmsis_nn_per_channel_quant_params;
/** CMSIS-NN object for the per-tensor quantization parameters.
 *  A single multiplier/shift pair, in contrast to the per-channel variant
 *  which stores pointers to arrays. */
typedef struct
{
int32_t multiplier; /**< Multiplier value */
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;
/** CMSIS-NN object for the quantized Relu activation.
 *  Describes the clamping range applied to a result. */
typedef struct
{
int32_t min; /**< Min value used to clamp the result */
int32_t max; /**< Max value used to clamp the result */
} cmsis_nn_activation;
/** CMSIS-NN object for the convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_tile stride; /**< Stride as a width/height pair */
cmsis_nn_tile padding; /**< Padding as a width/height pair */
cmsis_nn_tile dilation; /**< Dilation as a width/height pair */
cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
} cmsis_nn_conv_params;
/** CMSIS-NN object for Depthwise convolution layer parameters.
 *  Same members as cmsis_nn_conv_params plus the channel multiplier. */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
int32_t ch_mult; /**< Channel Multiplier. ch_mult * in_ch = out_ch */
cmsis_nn_tile stride; /**< Stride as a width/height pair */
cmsis_nn_tile padding; /**< Padding as a width/height pair */
cmsis_nn_tile dilation; /**< Dilation as a width/height pair */
cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
} cmsis_nn_dw_conv_params;
/** CMSIS-NN object for pooling layer parameters */
typedef struct
{
cmsis_nn_tile stride; /**< Stride as a width/height pair */
cmsis_nn_tile padding; /**< Padding as a width/height pair */
cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
} cmsis_nn_pool_params;
/** CMSIS-NN object for Fully Connected layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation activation; /**< Min/max range used to clamp the result */
} cmsis_nn_fc_params;
/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
int32_t rank; /**< NOTE(review): presumably the rank of the SVDF layer - meaning not stated here, confirm */
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation input_activation; /**< Min/max clamping range named for the input side */
cmsis_nn_activation output_activation; /**< Min/max clamping range named for the output side */
} cmsis_nn_svdf_params;
/** CMSIS-NN object for Softmax s16 layer parameters.
 *  Holds pointers to two read-only int16_t look-up tables; the table sizes
 *  are not specified in this header. */
typedef struct
{
const int16_t *exp_lut; /**< NOTE(review): presumably the exponential look-up table - confirm */
const int16_t *one_by_one_lut; /**< NOTE(review): presumably a reciprocal-style look-up table - confirm */
} cmsis_nn_softmax_lut_s16;
#endif // _ARM_NN_TYPES_H

View File

@ -1,30 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Static library built from the s8 activation sources of this directory.
# NN and ROOT are not set in this file; the including scope has to provide them.
project(CMSISNNActivation)

# Only files matching *_s8.c are collected; other sources are not part of this library.
file(GLOB SRC "./*_s8.c")
add_library(CMSISNNActivation STATIC ${SRC})

### Includes
target_include_directories(CMSISNNActivation PUBLIC
    "${NN}/Include"
    "${ROOT}/CMSIS/Core/Include"
    "${ROOT}/CMSIS/DSP/Include")

View File

@ -1,96 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q15.c
* Description: Q15 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
 * @brief Q15 neural network activation function using direct table look-up
 * @param[in,out] data       buffer that is read and overwritten in place
 * @param[in]     size       number of elements in data
 * @param[in]     int_width  bit-width of the integer part; the look-up shift is 8 + 3 - int_width
 * @param[in]     type       ARM_SIGMOID selects the sigmoid table, every other value the tanh table
 *
 * @details
 *
 * The bits above the shift select one of the table entries (index taken
 * modulo 256), the bits below it are the fraction used to interpolate
 * linearly between that entry and the next one.
 *
 * @note Refer header file for details.
 */
void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
    const uint16_t shift_size = 8 + 3 - int_width;
    const uint32_t bit_mask = 0x7FF >> int_width;
    const uint32_t full_frac = bit_mask + 1;
    const q15_t *lookup_table = (type == ARM_SIGMOID) ? sigmoidTable_q15 : tanhTable_q15;

    for (uint16_t idx = 0; idx < size; idx++)
    {
        const q15_t in = data[idx];
        const int bucket = in >> shift_size;
        const q15_t frac = (uint32_t)in & bit_mask;
        const q15_t value = lookup_table[(uint8_t)bucket];

        if (bucket == 0x7f)
        {
            /* the largest positive value does not have a right side for linear interpolation */
            data[idx] = value;
            continue;
        }

        /* interpolate between this entry and its right neighbour for better accuracy */
        const q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)bucket))];
        data[idx] = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size;
    }
}
/**
* @} end of Acti group
*/

View File

@ -1,89 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q7.c
* Description: Q7 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
 * @brief Q7 neural network activation function using direct table look-up
 * @param[in,out] data       buffer that is read and overwritten in place
 * @param[in]     size       number of elements in data
 * @param[in]     int_width  bit-width of the integer part; the look-up shift is 3 - int_width,
 *                           so int_width is assumed not to exceed 3
 * @param[in]     type       ARM_SIGMOID selects the sigmoid table, every other value the tanh table
 *
 * @details
 *
 * This is the direct table look-up approach: each element is shifted right
 * by (3 - int_width), reduced to an 8-bit index and replaced by the table
 * entry at that index. No interpolation is performed.
 *
 * An integer part wider than 3 bits is not supported; per the original
 * remark it would make no difference compared with saturating the input
 * before applying one of these activation functions.
 */
void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
    const uint16_t shift_size = 3 - int_width;
    const q7_t *lookup_table = (type == ARM_SIGMOID) ? sigmoidTable_q7 : tanhTable_q7;
    q7_t *const end = data + size;

    for (q7_t *cursor = data; cursor != end; ++cursor)
    {
        *cursor = lookup_table[(uint8_t)(*cursor >> shift_size)];
    }
}
/**
* @} end of Acti group
*/

View File

@ -1,65 +0,0 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu6_s8.c
* Description: Basic s8 version of ReLU6
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/*
 * Basic ReLU6 function
 *
 * Clamps every element of data, in place, to the range [0, 6].
 *
 * Refer to header file for details.
 *
 */
void arm_relu6_s8(q7_t *data, uint16_t size)
{
    q7_t *const stop = data + size;

    for (q7_t *elem = data; elem < stop; ++elem)
    {
        const int32_t lower_bounded = MAX((int32_t)*elem, 0);
        *elem = MIN(lower_bounded, 6);
    }
}
/**
* @} end of Acti group
*/

View File

@ -1,104 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q15.c
* Description: Q15 version of ReLU
*
* $Date: 09. October 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
 * @brief Q15 RELU function
 * @param[in,out] data pointer to the buffer, rewritten in place
 * @param[in]     size number of elements
 *
 * @details
 *
 * Every negative element is replaced by 0. With the DSP extension two
 * elements are handled per 32-bit word using a QSUB16-generated mask; an
 * odd trailing element is handled on its own. Without the DSP extension a
 * plain element-by-element loop is used.
 *
 */
void arm_relu_q15(q15_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP)
    /* M cores with DSP extension: two q15 lanes per iteration */
    q15_t *src = data;
    q15_t *dst = data;

    for (uint16_t pairs = size >> 1; pairs != 0; pairs--)
    {
        const q31_t packed = read_q15x2_ia(&src);
        /* bring the sign bit of each 16-bit lane down to bit 0 of that lane */
        const q31_t sign_bits = __ROR(packed & 0x80008000, 15);
        /* 0 - sign per lane: a lane that held a negative value becomes the mask */
        const q31_t negative_lanes = __QSUB16(0x00000000, sign_bits);
        arm_nn_write_q15x2_ia(&dst, packed & (~negative_lanes));
    }

    /* odd element count: one element is left after the paired loop */
    if ((size & 0x1) && (*src < 0))
    {
        *src = 0;
    }
#else
    /* Reference implementation for M cores without DSP extension */
    q15_t *const end = data + size;

    for (q15_t *elem = data; elem != end; ++elem)
    {
        if (*elem < 0)
        {
            *elem = 0;
        }
    }
#endif /* ARM_MATH_DSP */
}
/**
* @} end of Acti group
*/

View File

@ -1,109 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q7.c
* Description: Q7 version of ReLU
*
* $Date: 20. July 2021
* $Revision: V.1.1.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
 * @brief Q7 RELU function
 * @param[in,out] data pointer to the buffer, rewritten in place
 * @param[in]     size number of elements
 *
 * @details
 *
 * Every negative element is replaced by 0. When the DSP extension is
 * available and MVE is not selected, four elements are handled per 32-bit
 * word using a QSUB8-generated mask, followed by a scalar loop for the
 * remaining size % 4 elements. Otherwise a plain element-by-element loop
 * is used.
 *
 */
void arm_relu_q7(q7_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* M cores with DSP extension: four q7 lanes per iteration */
    q7_t *src = data;
    q7_t *dst = data;

    for (uint16_t quads = size >> 2; quads != 0; quads--)
    {
        const q31_t packed = arm_nn_read_q7x4_ia((const q7_t **)&src);
        /* bring the sign bit of each 8-bit lane down to bit 0 of that lane */
        const q31_t sign_bits = (int32_t)__ROR((uint32_t)packed & 0x80808080, 7);
        /* 0 - sign per lane: a lane that held a negative value becomes the mask */
        const q31_t negative_lanes = __QSUB8(0x00000000, sign_bits);
        arm_nn_write_q7x4_ia(&dst, packed & (~negative_lanes));
    }

    /* elements that did not fill a complete group of four */
    for (uint16_t leftover = size & 0x3; leftover != 0; leftover--)
    {
        if (*src < 0)
        {
            *src = 0;
        }
        src++;
    }
#else
    /* Reference implementation for cores without DSP extension */
    q7_t *const end = data + size;

    for (q7_t *elem = data; elem != end; ++elem)
    {
        if (*elem < 0)
        {
            *elem = 0;
        }
    }
#endif
}
/**
* @} end of Acti group
*/

View File

@ -1,31 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Static library built from the basic-maths sources of this directory.
# NN and ROOT are not set in this file; the including scope has to provide them.
project(CMSISNNBasicMaths)

# Every C file whose name contains an underscore is collected.
file(GLOB SRC "./*_*.c")
add_library(CMSISNNBasicMaths STATIC ${SRC})

### Includes
target_include_directories(CMSISNNBasicMaths PUBLIC
    "${NN}/Include"
    "${ROOT}/CMSIS/Core/Include"
    "${ROOT}/CMSIS/DSP/Include")

View File

@ -1,105 +0,0 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s16
* Description: Elementwise add
*
* $Date: 14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
 * s16 elementwise add
 *
 * For each of the block_size elements: both inputs are shifted left by
 * left_shift and requantized with their own multiplier/shift, the two
 * results are added, the sum is requantized with out_mult/out_shift,
 * clamped to [out_activation_min, out_activation_max] and stored as int16_t.
 *
 * input_1_offset, input_2_offset and out_offset are accepted but not used.
 * Always returns ARM_MATH_SUCCESS.
 *
 * Refer header file for details.
 *
 * NOTE(review): the original carried the remark "__SHIFT is expected to be <=0";
 * no such name exists in this function, so it presumably refers to the shift
 * arguments. Confirm against the header.
 */
arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
                                   const int16_t *input_2_vect,
                                   const int32_t input_1_offset,
                                   const int32_t input_1_mult,
                                   const int32_t input_1_shift,
                                   const int32_t input_2_offset,
                                   const int32_t input_2_mult,
                                   const int32_t input_2_shift,
                                   const int32_t left_shift,
                                   int16_t *output,
                                   const int32_t out_offset,
                                   const int32_t out_mult,
                                   const int32_t out_shift,
                                   const int32_t out_activation_min,
                                   const int32_t out_activation_max,
                                   const int32_t block_size)
{
    (void)input_1_offset;
    (void)input_2_offset;
    (void)out_offset;

    for (int32_t idx = 0; idx < block_size; idx++)
    {
        /* C = A + B */
        const int32_t lhs = arm_nn_requantize(input_1_vect[idx] << left_shift, input_1_mult, input_1_shift);
        const int32_t rhs = arm_nn_requantize(input_2_vect[idx] << left_shift, input_2_mult, input_2_shift);

        int32_t acc = arm_nn_requantize(lhs + rhs, out_mult, out_shift);
        acc = MAX(acc, out_activation_min);
        acc = MIN(acc, out_activation_max);

        output[idx] = (int16_t)acc;
    }

    return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/

View File

@ -1,255 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s8
* Description: Element wise add
*
* $Date: 01. March 2021
* $Revision: V.2.5.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
#endif
#if defined(ARM_MATH_MVEI)
/**
 * Vector form of SAT_INPUT: rescales __INPUT_V in place by a doubling high
 * multiplication with __MULT followed by a division by 2^(-__SHIFT).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement,
 * and every argument is parenthesized so that compound expressions are
 * negated/assigned as a whole.
 */
#define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT)                                 \
    do                                                                             \
    {                                                                              \
        (__INPUT_V) = arm_doubling_high_mult_mve((__INPUT_V), (__MULT));           \
        (__INPUT_V) = arm_divide_by_power_of_two_mve((__INPUT_V), -(__SHIFT));     \
    } while (0)
#endif
/**
 * Rescales __INPUT in place by a doubling high multiplication with __MULT
 * followed by a division by 2^(-__SHIFT).
 *
 * @note The *_no_sat API does not mean that the input not saturated, Since
 * __MULT is a positive integer, it is saturated. The API definition
 * has more info about it.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement,
 * and every argument is parenthesized so that compound expressions are
 * negated/assigned as a whole.
 */
#define SAT_INPUT(__INPUT, __MULT, __SHIFT)                                        \
    do                                                                             \
    {                                                                              \
        (__INPUT) = arm_nn_doubling_high_mult_no_sat((__INPUT), (__MULT));         \
        (__INPUT) = arm_nn_divide_by_power_of_two((__INPUT), -(__SHIFT));          \
    } while (0)
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
 * s8 element wise add
 *
 * For each of the block_size elements: the input offsets are added, both
 * values are shifted left by left_shift and rescaled with their own
 * multiplier/shift, the two results are added, the sum is rescaled with
 * out_mult/out_shift, out_offset is added and the result is clamped to
 * [out_activation_min, out_activation_max] before being stored as int8.
 *
 * Three compile-time variants exist: an MVE vector loop, a DSP-extension
 * loop producing four outputs per iteration followed by a scalar tail, and
 * a purely scalar loop. Always returns ARM_MATH_SUCCESS.
 *
 * Refer header file for details.
 *
 */
/* Note: the shift arguments handed to SAT_INPUT / SAT_INPUT_VECT (__SHIFT) are expected to be <=0 */
arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
                                  const int8_t *input_2_vect,
                                  const int32_t input_1_offset,
                                  const int32_t input_1_mult,
                                  const int32_t input_1_shift,
                                  const int32_t input_2_offset,
                                  const int32_t input_2_mult,
                                  const int32_t input_2_shift,
                                  const int32_t left_shift,
                                  int8_t *output,
                                  const int32_t out_offset,
                                  const int32_t out_mult,
                                  const int32_t out_shift,
                                  const int32_t out_activation_min,
                                  const int32_t out_activation_max,
                                  const uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
    int32_t count = (int32_t)block_size;

    while (count > 0)
    {
        int32x4_t vect_1;
        int32x4_t vect_2;

        /* predicate derived from the remaining element count */
        mve_pred16_t p = vctp32q((uint32_t)count);

        vect_1 = vldrbq_z_s32(input_1_vect, p);
        vect_2 = vldrbq_z_s32(input_2_vect, p);

        vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
        vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));

        vect_1 = vshlq_r_s32(vect_1, left_shift);
        vect_2 = vshlq_r_s32(vect_2, left_shift);

        SAT_INPUT_VECT(vect_1, input_1_mult, input_1_shift);
        SAT_INPUT_VECT(vect_2, input_2_mult, input_2_shift);

        vect_1 = vaddq_s32(vect_1, vect_2);
        SAT_INPUT_VECT(vect_1, out_mult, out_shift);

        vect_1 = vaddq_n_s32(vect_1, out_offset);

        vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
        vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));

        input_1_vect += 4;
        input_2_vect += 4;
        vstrbq_p_s32(output, vect_1, p);

        output += 4;
        count -= 4;
    }
#else
    uint32_t loop_count;
    int32_t input_1;
    int32_t input_2;
    int32_t sum;

#if defined(ARM_MATH_DSP)
    int32_t a_1, b_1, a_2, b_2;

    int32_t offset_1_packed, offset_2_packed;

    int8_t r1, r2, r3, r4;

    /* replicate each offset into both 16-bit halves so one __SADD16 covers two lanes */
    offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
    offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);

    loop_count = block_size >> 2;

    while (loop_count > 0U)
    {
        /* 4 outputs are calculated in one loop. The order of calculation follows the order of the output
           sign extension intrinsic */
        input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
        input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);

        a_1 = __SADD16(a_1, offset_1_packed);
        b_1 = __SADD16(b_1, offset_1_packed);

        a_2 = __SADD16(a_2, offset_2_packed);
        b_2 = __SADD16(b_2, offset_2_packed);

        /*
         * Each 16-bit lane is sign-extended with an (int16_t) cast before the
         * left shift. Masking alone would zero-extend the lane, so a negative
         * (input + offset) would only scale correctly when left_shift >= 16;
         * with the cast the result matches the scalar tail loop below for
         * every left_shift.
         */

        /* Sum 1 */
        input_1 = (int16_t)(b_1 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);

        input_2 = (int16_t)(b_2 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);

        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r1 = (q7_t)sum;

        /* Sum 3 */
        input_1 = (int16_t)((b_1 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);

        input_2 = (int16_t)((b_2 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);

        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r3 = (q7_t)sum;

        /* Sum 2 */
        input_1 = (int16_t)(a_1 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);

        input_2 = (int16_t)(a_2 & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);

        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r2 = (q7_t)sum;

        /* Sum 4 */
        input_1 = (int16_t)((a_1 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_1, input_1_mult, input_1_shift);

        input_2 = (int16_t)((a_2 >> 16) & 0x0FFFF) << left_shift;
        SAT_INPUT(input_2, input_2_mult, input_2_shift);

        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;
        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);
        r4 = (q7_t)sum;

        write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));

        loop_count--;
    }

    /* elements that did not fill a complete group of four */
    loop_count = block_size & 0x3;
#else
    loop_count = block_size;
#endif

    while (loop_count > 0U)
    {
        /* C = A + B */

        input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
        input_2 = (*input_2_vect++ + input_2_offset) << left_shift;

        input_1 = arm_nn_doubling_high_mult(input_1, input_1_mult);
        input_1 = arm_nn_divide_by_power_of_two(input_1, -input_1_shift);

        input_2 = arm_nn_doubling_high_mult(input_2, input_2_mult);
        input_2 = arm_nn_divide_by_power_of_two(input_2, -input_2_shift);

        sum = input_1 + input_2;
        SAT_INPUT(sum, out_mult, out_shift);
        sum += out_offset;

        sum = MAX(sum, out_activation_min);
        sum = MIN(sum, out_activation_max);

        *output++ = (q7_t)sum;

        /* Decrement loop counter */
        loop_count--;
    }

#endif /* ARM_MATH_MVEI */

    return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/

View File

@ -1,95 +0,0 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s16
* Description: Element wise multiplication
*
* $Date: 14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
 * @brief s16 element wise multiplication of two vectors
 *
 * @details For each of the block_size elements the two inputs are
 *          multiplied, the product is requantized with out_mult/out_shift,
 *          clamped to [out_activation_min, out_activation_max] and stored
 *          as int16_t. input_1_offset, input_2_offset and out_offset are
 *          accepted but not used. Always returns ARM_MATH_SUCCESS.
 *
 * @note Refer header file for details.
 *
 */
arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
                                   const int16_t *input_2_vect,
                                   const int32_t input_1_offset,
                                   const int32_t input_2_offset,
                                   int16_t *output,
                                   const int32_t out_offset,
                                   const int32_t out_mult,
                                   const int32_t out_shift,
                                   const int32_t out_activation_min,
                                   const int32_t out_activation_max,
                                   const int32_t block_size)
{
    (void)input_1_offset;
    (void)input_2_offset;
    (void)out_offset;

    for (int32_t idx = 0; idx < block_size; idx++)
    {
        /* C = A * B */
        const int32_t lhs = input_1_vect[idx];
        const int32_t rhs = input_2_vect[idx];

        int32_t product = arm_nn_requantize(lhs * rhs, out_mult, out_shift);
        product = MAX(product, out_activation_min);
        product = MIN(product, out_activation_max);

        output[idx] = (int16_t)product;
    }

    return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@ -1,200 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s8
* Description: Element wise multiplication
*
* $Date: January 26, 2021
* $Revision: V.1.0.5
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
 * @brief s8 element wise multiplication of two vectors
 *
 * @details For each of the block_size elements the input offsets are added
 *          to the two inputs, the values are multiplied, the product is
 *          requantized with out_mult/out_shift, out_offset is added and the
 *          result is clamped to [out_activation_min, out_activation_max]
 *          before being stored as an 8-bit value.
 *
 *          Three compile-time variants exist: an MVE vector loop, a
 *          DSP-extension loop producing four outputs per iteration followed
 *          by a scalar tail, and a purely scalar loop.
 *
 * @return Always ARM_MATH_SUCCESS.
 *
 * @note Refer header file for details.
 *
 */
arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const uint32_t block_size)
{
int32_t loop_count;
#if defined(ARM_MATH_MVEI)
/* number of 4-element vector iterations, rounded up */
loop_count = (block_size + 3) / 4;
uint32_t num_elements = block_size;
for (int i = 0; i < loop_count; i++)
{
/* predicate derived from the remaining element count */
mve_pred16_t p = vctp32q(num_elements);
int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p);
input_1 = vaddq_n_s32(input_1, input_1_offset);
int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p);
input_2 = vaddq_n_s32(input_2, input_2_offset);
int32x4_t res_0 = vmulq_s32(input_1, input_2);
res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
res_0 += vdupq_n_s32(out_offset);
res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
vstrbq_p_s32(output, res_0, p);
input_1_vect += 4;
input_2_vect += 4;
output += 4;
num_elements -= 4;
}
#else
int32_t input_1;
int32_t input_2;
int32_t mul_res;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
/* replicate each offset into both 16-bit halves so one __SADD16 covers two lanes */
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0)
{
/* 4 outputs are calculated in one loop. The order of calculation follows the order of the output
sign extension intrinsic: the low/high halves of b_x feed results 1 and 3, those of a_x feed
results 2 and 4 */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Mul 1 */
/* the (int16_t) cast sign-extends the extracted 16-bit lane */
input_1 = (int16_t)(b_1 & 0x0FFFFL);
input_2 = (int16_t)(b_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r1 = (q7_t)mul_res;
/* Mul 3 */
input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r3 = (q7_t)mul_res;
/* Mul 2 */
input_1 = (int16_t)(a_1 & 0x0FFFFL);
input_2 = (int16_t)(a_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r2 = (q7_t)mul_res;
/* Mul 4 */
input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r4 = (q7_t)mul_res;
write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
loop_count--;
}
/* elements that did not fill a complete group of four are left to the scalar loop */
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0)
{
/* C = A * B */
input_1 = *input_1_vect++ + input_1_offset;
input_2 = *input_2_vect++ + input_2_offset;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (q7_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@ -1,98 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ROOT is taken from CMSIS_PATH, which has to be provided by the caller.
SET(ROOT ${CMSIS_PATH})
# Select which parts of CMSIS-NN must be compiled.
# There are some dependencies between the parts but they are not tracked
# by this cmake. So, enabling some functions may require to enable some
# other ones.
# Every option defaults to ON.
option(CONCATENATION "Concatenation" ON)
option(FULLYCONNECTED "Fully Connected" ON)
option(CONVOLUTION "Convolutions" ON)
option(ACTIVATION "Activations" ON)
option(POOLING "Pooling" ON)
option(SOFTMAX "Softmax" ON)
option(BASICMATHSNN "Basic Maths for NN" ON)
option(RESHAPE "Reshape" ON)
option(SVDF "SVDF" ON)
# Original remark: "When OFF it is the default behavior : all tables are included."
# NOTE(review): this file contains nothing table-related; the remark looks carried
# over from another CMake file. Confirm what disabling NNSUPPORT is meant to do.
option(NNSUPPORT "NN Support" ON)
###########################
#
# CMSIS NN
#
###########################
# NN Sources
SET(NN ${ROOT}/CMSIS/NN)
list(APPEND CMAKE_MODULE_PATH ${NN}/Source)
# The library target is created without sources here; the subdirectories
# added below are expected to contribute them.
add_library(cmsis-nn STATIC)
target_compile_options(cmsis-nn PRIVATE -Ofast)
### Includes
target_include_directories(cmsis-nn PUBLIC "${NN}/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/Core/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/DSP/Include")
# One subdirectory per enabled option.
if (BASICMATHSNN)
add_subdirectory(BasicMathFunctions)
endif()
if (CONCATENATION)
add_subdirectory(ConcatenationFunctions)
endif()
if (FULLYCONNECTED)
add_subdirectory(FullyConnectedFunctions)
endif()
if (CONVOLUTION)
add_subdirectory(ConvolutionFunctions)
endif()
if (ACTIVATION)
add_subdirectory(ActivationFunctions)
endif()
if (POOLING)
add_subdirectory(PoolingFunctions)
endif()
if (SOFTMAX)
add_subdirectory(SoftmaxFunctions)
endif()
if (SVDF)
add_subdirectory(SVDFunctions)
endif()
if (RESHAPE)
add_subdirectory(ReshapeFunctions)
endif()
# Keep NNSUPPORT at the end
if (NNSUPPORT)
add_subdirectory(NNSupportFunctions)
endif()

View File

@ -1,20 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Collect every C file of this directory whose name contains an underscore
# and add it as a private source of the cmsis-nn target. That target is not
# created here, so it must already exist when this file is processed.
file(GLOB SRC "./*_*.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@ -1,66 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_w.c
* Description: s8 version of concatenation along the W axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
 * s8 version of concatenation along the W axis
 *
 * Copies the complete input tensor (input_x * input_y * input_z * input_w
 * elements) into the output buffer, starting
 * offset_w * (input_x * input_y * input_z) elements after its beginning.
 *
 * input      pointer to the source tensor
 * input_x    input size along X
 * input_y    input size along Y
 * input_z    input size along Z
 * input_w    input size along W
 * output     pointer to the start of the destination tensor
 * offset_w   position along W in the destination at which the input is placed
 *
 * Refer to header file for details.
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w)
{
    /* Widen to uint32_t before multiplying: uint16_t operands are otherwise
     * promoted to signed int, and an overflowing signed product is undefined
     * behaviour. Unsigned arithmetic gives the same result for every size that
     * fits and wraps in a defined way for the rest. */
    const uint32_t input_xyz = (uint32_t)input_x * input_y * input_z;
    const uint32_t input_copy_size = input_xyz * input_w;

    output += offset_w * input_xyz;

    arm_memcpy_q7(output, input, input_copy_size);
}
/**
* @} end of Concatenation group
*/

View File

@ -1,75 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_x.c
* Description: s8 version of concatenation along the X axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
 * s8 version of concatenation along the X axis
 *
 * Copies the input one row (input_x elements) at a time. After every row the
 * input pointer advances by input_x and the output pointer by output_x, so
 * each input row lands at offset_x within the matching output row.
 *
 * input      pointer to the source tensor
 * input_x    input size along X (elements copied per row)
 * input_y    input size along Y
 * input_z    input size along Z
 * input_w    input size along W
 * output     pointer to the start of the destination tensor
 * output_x   number of elements the output pointer advances per row
 * offset_x   position along X in the destination at which the input is placed
 *
 * Refer to header file for details.
 *
 */
void arm_concatenation_s8_x(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_x,
                            const uint32_t offset_x)
{
    /* Widen to uint32_t before multiplying: uint16_t operands are otherwise
     * promoted to signed int, and an overflowing signed product is undefined
     * behaviour. */
    const uint32_t num_iterations = (uint32_t)input_y * input_z * input_w;

    output += offset_x;

    /* Copy per row */
    for (uint32_t i = 0; i < num_iterations; ++i)
    {
        arm_memcpy_q7(output, input, input_x);
        input += input_x;
        output += output_x;
    }
}
/**
* @} end of Concatenation group
*/

View File

@ -1,76 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_y.c
* Description: s8 version of concatenation along the Y axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
 * s8 version of concatenation along the Y axis
 *
 * Copies the input one X-Y tile (input_x * input_y elements) at a time. After
 * every tile the output pointer advances by input_x * output_y elements, so
 * each input tile lands offset_y rows into the matching output tile.
 *
 * input      pointer to the source tensor
 * input_x    input size along X
 * input_y    input size along Y
 * input_z    input size along Z
 * input_w    input size along W
 * output     pointer to the start of the destination tensor
 * output_y   destination size along Y (used for the output tile stride)
 * offset_y   position along Y in the destination at which the input is placed
 *
 * Refer to header file for details.
 *
 */
void arm_concatenation_s8_y(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_y,
                            const uint32_t offset_y)
{
    /* Widen to uint32_t before multiplying: two uint16_t operands are promoted
     * to signed int, and 65535 * 65535 already exceeds INT_MAX on a 32-bit int,
     * which is undefined behaviour. */
    const uint32_t num_iterations = (uint32_t)input_z * input_w;
    const uint32_t input_copy_size = (uint32_t)input_x * input_y;
    const uint32_t output_stride = (uint32_t)input_x * output_y;

    output += offset_y * input_x;

    /* Copy per tile */
    for (uint32_t i = 0; i < num_iterations; ++i)
    {
        arm_memcpy_q7(output, input, input_copy_size);
        input += input_copy_size;
        output += output_stride;
    }
}
/**
* @} end of Concatenation group
*/

View File

@ -1,75 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_z.c
* Description: s8 version of concatenation along the Z axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
 * s8 version of concatenation along the Z axis
 *
 * Copies the input one X-Y-Z volume (input_x * input_y * input_z elements) per
 * W index. After every volume the output pointer advances by
 * input_x * input_y * output_z elements, so each input volume lands offset_z
 * planes into the matching output volume.
 *
 * input      pointer to the source tensor
 * input_x    input size along X
 * input_y    input size along Y
 * input_z    input size along Z
 * input_w    input size along W (number of volumes copied)
 * output     pointer to the start of the destination tensor
 * output_z   destination size along Z (used for the output volume stride)
 * offset_z   position along Z in the destination at which the input is placed
 *
 * Refer to header file for details.
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z)
{
    /* Widen to uint32_t before multiplying: uint16_t operands are otherwise
     * promoted to signed int, and an overflowing signed product is undefined
     * behaviour. */
    const uint32_t input_xy = (uint32_t)input_x * input_y;
    const uint32_t input_copy_size = input_xy * input_z;
    const uint32_t output_stride = input_xy * output_z;

    output += offset_z * input_xy;

    for (uint32_t i = 0; i < input_w; ++i)
    {
        arm_memcpy_q7(output, input, input_copy_size);
        input += input_copy_size;
        output += output_stride;
    }
}
/**
* @} end of Concatenation group
*/

View File

@ -1,24 +0,0 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Collect the C files in this directory whose names contain "_s8" or "_s16"
# and add both lists as private sources of the cmsis-nn target. Files matching
# neither pattern are not added by this fragment.
file(GLOB SRC "./*_s8*.c")
file(GLOB SRC_S16 "./*_s16*.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})

View File

@ -1,205 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1_x_n_s8.c
* Description: s8 version of 1xN convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* 1xN s8 convolution function.
*
* Refer to the header file for the full parameter description.
*
* Behaviour visible in this implementation:
* - Returns ARM_MATH_SIZE_MISMATCH before doing any work when output_dims->w
*   is not a multiple of 4.
* - When ARM_MATH_MVEI is defined, the output row is produced four columns
*   per loop iteration; the per-channel path skips the bias addition when
*   bias_data is NULL.
* - Otherwise every argument is forwarded unchanged to arm_convolve_s8() and
*   its status is returned.
*
*/
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
/* bias_dims is not read by this function */
(void)bias_dims;
arm_status status = ARM_MATH_SUCCESS;
/* Both build variants require an output width that is a multiple of 4 */
if (output_dims->w % 4 != 0)
{
status = ARM_MATH_SIZE_MISMATCH;
goto out;
}
#if defined(ARM_MATH_MVEI)
/* ctx is not read in the MVE build */
(void)ctx;
const uint16_t input_x = input_dims->w;
const uint16_t kernel_x = filter_dims->w;
const uint16_t output_x = output_dims->w;
const uint16_t output_ch = output_dims->c;
const uint16_t input_ch = input_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t stride_x = conv_params->stride.w;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
/* Walk the output row four columns at a time */
for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
{
int32_t input_begin_idx[4];
int32_t ker_begin_idx[4];
int32_t ker_end_idx[4];
/* For each of the four columns, clip the kernel window against the input row */
for (int i = 0; i < 4; i++)
{
/* Input index of the first kernel tap; negative when it falls into the left padding */
const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
input_begin_idx[i] = MAX(0, est_input_x_idx);
ker_begin_idx[i] = MAX(0, -est_input_x_idx);
ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
}
/* Per-channel path: taken when the first column of the group loses taps at its start
* or the last column loses taps at its end. Only columns 0 and 3 are examined here. */
if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32x4_t s_offset;
int32_t acc[4];
{
int32_t sum_row[4];
/* One 1x kernel call per output column, each restricted to the clipped tap range
* of that column; the call fills sum_row[k] and acc[k].
* NOTE(review): sum_row is scaled by input_offset and added to acc below, which
* looks like an input-offset correction term - confirm against
* arm_nn_mat_mul_core_1x_s8. */
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[0] * input_ch),
&sum_row[0],
&acc[0]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
input_data + input_begin_idx[1] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[1] * input_ch),
&sum_row[1],
&acc[1]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
input_data + input_begin_idx[2] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[2] * input_ch),
&sum_row[2],
&acc[2]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
input_data + input_begin_idx[3] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[3] * input_ch),
&sum_row[3],
&acc[3]);
s_offset = vldrwq_s32(sum_row);
}
/* res = acc + sum_row * input_offset (+ bias), then requantize, offset and clamp */
int32x4_t res = vldrwq_s32(acc);
s_offset = vmulq_n_s32(s_offset, input_offset);
res = vaddq_s32(res, s_offset);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
/* The four lanes are stored output_ch elements apart, i.e. at the same channel of
* four consecutive output columns */
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
/* Next channel of the first column in the group */
output_data++;
}
/* The channel loop advanced by one column; skip the other three already written */
output_data += (3 * output_ch);
}
else
{
/* No clipping needed: hand the four columns to the 4x kernel and continue from the
* output pointer it returns */
output_data = arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
stride_x * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
}
}
#else
/* Without MVE the work is delegated to the generic s8 convolution */
status = arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
out:
/* Return to application */
return status;
}
/*
 * Size reported for the scratch buffer of arm_convolve_1_x_n_s8().
 *
 * With ARM_MATH_MVEI defined the result is always 0 and neither argument is
 * read. Otherwise the result is
 * 2 * input_dims->c * filter_dims->w * filter_dims->h elements of
 * sizeof(int16_t) each.
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
    (void)filter_dims;
    (void)input_dims;
    return 0;
#else
    return sizeof(int16_t) * (2 * input_dims->c * filter_dims->w * filter_dims->h);
#endif
}
/**
* @} end of NNConv group
*/

View File

@ -1,235 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of 1x1 convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x (must be 1)
 * @param[in]       dim_kernel_y filter kernel size y (must be 1)
 * @param[in]       padding_x    padding size x (must be 0)
 * @param[in]       padding_y    padding size y (must be 0)
 * @param[in]       stride_x     convolution stride x (must be 1)
 * @param[in]       stride_y     convolution stride y (must be 1)
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input; the DSP build writes up to
 *                               2 * ch_im_in * dim_kernel_x * dim_kernel_y q15_t values into it,
 *                               the reference build does not use it
 * @param[in,out]   bufferB      pointer to buffer space for output (not used by this function)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
 * separable convolution.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is multiple of 4
 *   ch_im_out is multiple of 2
 *
 * [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
 * https://arxiv.org/abs/1704.04861
 */
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                  const uint16_t dim_im_in_x,
                                                  const uint16_t dim_im_in_y,
                                                  const uint16_t ch_im_in,
                                                  const q7_t *wt,
                                                  const uint16_t ch_im_out,
                                                  const uint16_t dim_kernel_x,
                                                  const uint16_t dim_kernel_y,
                                                  const uint16_t padding_x,
                                                  const uint16_t padding_y,
                                                  const uint16_t stride_x,
                                                  const uint16_t stride_y,
                                                  const q7_t *bias,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
                                                  q7_t *Im_out,
                                                  const uint16_t dim_im_out_x,
                                                  const uint16_t dim_im_out_y,
                                                  q15_t *bufferA,
                                                  q7_t *bufferB)
{
    (void)bufferB;

    /* check if the input dimension meets the constraints; done once, ahead of
     * the build-specific code, so both variants reject exactly the same inputs */
    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
        padding_y != 0 || stride_x != 1 || stride_y != 1)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    (void)dim_im_in_y;
    int16_t i_out_y, i_out_x;
    int16_t i_ch_out;

    /* -----------------------
     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
     *  im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;

    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function */
            arm_q7_to_q15_reordered_no_shift(
                (q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in);
            pBuffer += ch_im_in;

            /* Two columns are buffered before the matrix kernel is invoked */
            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* check if there is left-over for compute */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
        {
            q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
            const q15_t *pB = bufferA;
            /* basically each time it process 4 entries */
            uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;

            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;

                pA = read_and_pad_reordered(pA, &inA1, &inA2);

                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);

                colCnt--;
            }
            /* remaining entries that do not fill a group of 4 */
            colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        // if-for implementation
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                /* A kernel row holds dim_kernel_x taps, so that is the row stride
                                 * of the weight index (both kernel sizes are 1 here, so the value
                                 * is unchanged for every accepted input). */
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,161 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s8_fast.c
* Description: Fast q7 version of 1x1 convolution (non-square shape)
*
* $Date: 12. November 2021
* $Revision: V.2.0.4
*
* Target Processor: Cortex-M Processors
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#include <stdio.h>
#define DIM_KER_X (1U)
#define DIM_KER_Y (1U)
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * Fast s8 version for 1x1 convolution (non-square shape)
 *
 * Returns ARM_MATH_SIZE_MISMATCH unless input_dims->c is a multiple of 4,
 * both paddings are 0 and both strides are 1; otherwise ARM_MATH_SUCCESS.
 * ctx, filter_dims and bias_dims are not read.
 *
 * Refer header file for details.
 *
 */
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
                                    const cmsis_nn_conv_params *conv_params,
                                    const cmsis_nn_per_channel_quant_params *quant_params,
                                    const cmsis_nn_dims *input_dims,
                                    const q7_t *input_data,
                                    const cmsis_nn_dims *filter_dims,
                                    const q7_t *filter_data,
                                    const cmsis_nn_dims *bias_dims,
                                    const int32_t *bias_data,
                                    const cmsis_nn_dims *output_dims,
                                    q7_t *output_data)
{
    (void)bias_dims;
    (void)filter_dims;
    (void)ctx;

    /* Argument validation, split by the property being checked */
    const int channels_ok = (input_dims->c % 4 == 0);
    const int padding_ok = (conv_params->padding.w == 0) && (conv_params->padding.h == 0);
    const int stride_ok = (conv_params->stride.w == 1) && (conv_params->stride.h == 1);
    if (!(channels_ok && padding_ok && stride_ok))
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

#if defined(ARM_MATH_MVEI)
    const int32_t n_ch_in = input_dims->c;
    const int32_t n_ch_out = output_dims->c;
    const int32_t n_items = input_dims->w * input_dims->h * input_dims->n;
    const int32_t in_offset = conv_params->input_offset;
    const int32_t res_offset = conv_params->output_offset;
    const int32_t act_lo = conv_params->activation.min;
    const int32_t act_hi = conv_params->activation.max;
    int32_t *mult = quant_params->multiplier;
    int32_t *shift = quant_params->shift;

    /* Groups of four items go through the 4x kernel, which returns the advanced output pointer */
    for (int item = 0; item <= (n_items - 4); item += 4)
    {
        output_data = arm_nn_mat_mul_core_4x_s8(n_ch_in,
                                                n_ch_in,
                                                input_data + item * n_ch_in,
                                                filter_data,
                                                n_ch_out,
                                                conv_params,
                                                quant_params,
                                                bias_data,
                                                output_data);
    }

    /* Items left over after the groups of four are computed one channel at a time */
    for (int item = (n_items & ~0x3); item < n_items; item++)
    {
        const q7_t *item_in = input_data + item * n_ch_in;

        for (int ch = 0; ch < n_ch_out; ch++)
        {
            int32_t sum_row = 0;
            int32_t acc;

            (void)arm_nn_mat_mul_core_1x_s8(n_ch_in, item_in, filter_data + ch * n_ch_in, &sum_row, &acc);

            if (bias_data)
            {
                acc += bias_data[ch];
            }
            acc += sum_row * in_offset;

            acc = arm_nn_requantize(acc, mult[ch], shift[ch]);
            acc += res_offset;
            acc = MAX(acc, act_lo);
            acc = MIN(acc, act_hi);

            *output_data = acc;
            output_data++;
        }
    }
#else
    /* Builds without MVE (with or without the DSP extension) run one matrix multiplication */
    const int32_t n_lhs_rows = input_dims->w * input_dims->h * input_dims->n;
    const int32_t n_rhs_rows = output_dims->c;
    const int32_t n_rhs_cols = input_dims->c;

    arm_nn_mat_mult_nt_t_s8(input_data,
                            filter_data,
                            bias_data,
                            output_data,
                            quant_params->multiplier,
                            quant_params->shift,
                            n_lhs_rows,
                            n_rhs_rows,
                            n_rhs_cols,
                            conv_params->input_offset,
                            conv_params->output_offset,
                            conv_params->activation.min,
                            conv_params->activation.max);
#endif

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/*
 * Buffer size reported for arm_convolve_1x1_s8_fast().
 *
 * Always returns 0; input_dims is accepted but not read.
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims)
{
    const int32_t buffer_size = 0;

    (void)input_dims;

    return buffer_size;
}
/**
* @} end of NNConv group
*/

View File

@ -1,209 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_basic.c
* Description: Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Basic Q15 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input (im2col column in the DSP build,
 *                              unused in the reference build)
 * @param[in,out]   bufferB     pointer to buffer space for output (not used by this function)
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * This basic version is designed to work for any input tensor and weight
 * dimension.
 */
arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
                                      const uint16_t dim_im_in,
                                      const uint16_t ch_im_in,
                                      const q15_t *wt,
                                      const uint16_t ch_im_out,
                                      const uint16_t dim_kernel,
                                      const uint16_t padding,
                                      const uint16_t stride,
                                      const q15_t *bias,
                                      const uint16_t bias_shift,
                                      const uint16_t out_shift,
                                      q15_t *Im_out,
                                      const uint16_t dim_im_out,
                                      q15_t *bufferA,
                                      q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;

    q15_t *pBuffer = bufferA;
    q15_t *pOut = Im_out;
    q15_t *im_buffer = bufferA;
    const q15_t *pA;
    int i;

    /* Length of one im2col column / one filter row. Kept in 32 bits: a 16-bit
     * counter would silently truncate the column count for large kernels and
     * leave part of the dot product out of the result. */
    const uint32_t num_col = (uint32_t)ch_im_in * dim_kernel * dim_kernel;

    /* This part implements the im2col function */
    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* Filling 0 for out-of-bound paddings */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        /* Copy all channels of the input pixel under this kernel tap */
                        memcpy(pBuffer,
                               (q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* One dot product per output channel; pA walks through the filters back to back */
            pA = wt;
            for (i = 0; i < ch_im_out; i++)
            {
                q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                const q15_t *pB = im_buffer;
                /* Four entries are consumed per iteration of the first loop */
                uint32_t colCnt = num_col >> 2;
                while (colCnt)
                {
                    q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
                    q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
                    q31_t inA2 = arm_nn_read_q15x2_ia(&pA);
                    q31_t inB2 = arm_nn_read_q15x2_ia(&pB);

                    sum = __SMLAD(inA1, inB1, sum);
                    sum = __SMLAD(inA2, inB2, sum);

                    colCnt--;
                }
                /* Remaining entries that do not fill a group of four */
                colCnt = num_col & 0x3;
                while (colCnt)
                {
                    q15_t inA1 = *pA++;
                    q15_t inB1 = *pB++;
                    sum += inA1 * inB1;
                    colCnt--;
                }
                *pOut = (q15_t)__SSAT((sum >> out_shift), 16);
                pOut++;
            }

            /* counter reset */
            pBuffer = im_buffer;
        }
    }

#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        /* Taps that fall outside the input contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,259 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast.c
* Description: Fast Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
* dim_im_out is a multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q15_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q15_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q15_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB)
{
    /* bufferB belongs to the shared signature only; nothing below touches it. */
    (void)bufferB;

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /*
     * SIMD path (Cortex-M4 and Cortex-M7).
     * Two im2col columns are staged in bufferA and then multiplied against
     * two weight rows at a time, i.e. a 2x2 tile of outputs per iteration.
     */
    if ((ch_im_in % 2) != 0 || (ch_im_out % 2) != 0 || (dim_im_out & 0x1) != 0)
    {
        /* The dimensions do not meet the constraints of this kernel. */
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* Number of q15 values in one im2col column (one complete kernel window). */
    const int window_len = ch_im_in * dim_kernel * dim_kernel;
    q15_t *const columns = bufferA;
    q15_t *fill = columns;
    q15_t *dst = Im_out;
    int16_t out_row, out_col, src_row, src_col;

    for (out_row = 0; out_row < dim_im_out; out_row++)
    {
        for (out_col = 0; out_col < dim_im_out; out_col++)
        {
            const int first_row = out_row * stride - padding;
            const int first_col = out_col * stride - padding;

            /* im2col: stage one kernel window, writing zeros where it overlaps the padding. */
            for (src_row = (int16_t)first_row; src_row < first_row + dim_kernel; src_row++)
            {
                for (src_col = (int16_t)first_col; src_col < first_col + dim_kernel; src_col++)
                {
                    const int inside =
                        src_row >= 0 && src_row < dim_im_in && src_col >= 0 && src_col < dim_im_in;

                    if (inside)
                    {
                        memcpy(fill,
                               Im_in + (src_row * dim_im_in + src_col) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        memset(fill, 0, sizeof(q15_t) * ch_im_in);
                    }
                    fill += ch_im_in;
                }
            }

            /* Only every second output pixel completes a pair of staged columns. */
            if ((out_col & 0x1) == 0)
            {
                continue;
            }

            /* Weight row of the first output channel in the current channel pair. */
            const q15_t *w_first = wt;
            /* Output position of the second pixel of the pair. */
            q15_t *dst_next = dst + ch_im_out;
            int ch;

            for (ch = 0; ch < ch_im_out; ch += 2)
            {
                /* Weight row of the second channel, and the two staged columns. */
                const q15_t *w_second = w_first + window_len;
                const q15_t *col_a = columns;
                const q15_t *col_b = columns + window_len;
                /* Accumulators start from the shifted bias plus the rounding term. */
                const q31_t init_first = ((q31_t)bias[ch] << bias_shift) + NN_ROUND(out_shift);
                const q31_t init_second = ((q31_t)bias[ch + 1] << bias_shift) + NN_ROUND(out_shift);
                q31_t acc_first_a = init_first;
                q31_t acc_first_b = init_first;
                q31_t acc_second_a = init_second;
                q31_t acc_second_b = init_second;
                uint16_t left;

                /* Main loop: two q15 values per operand per iteration. */
                for (left = (uint16_t)(window_len >> 1); left != 0; left--)
                {
                    const q31_t w1 = arm_nn_read_q15x2_ia(&w_first);
                    const q31_t xa = arm_nn_read_q15x2_ia(&col_a);
                    const q31_t w2 = arm_nn_read_q15x2_ia(&w_second);
                    const q31_t xb = arm_nn_read_q15x2_ia(&col_b);

                    acc_first_a = __SMLAD(w1, xa, acc_first_a);
                    acc_first_b = __SMLAD(w1, xb, acc_first_b);
                    acc_second_a = __SMLAD(w2, xa, acc_second_a);
                    acc_second_b = __SMLAD(w2, xb, acc_second_b);
                }

                /* Scalar tail for an odd column length. */
                for (left = (uint16_t)(window_len & 0x1); left != 0; left--)
                {
                    const q15_t w1 = *w_first++;
                    const q15_t xa = *col_a++;
                    const q15_t w2 = *w_second++;
                    const q15_t xb = *col_b++;

                    acc_first_a += w1 * xa;
                    acc_first_b += w1 * xb;
                    acc_second_a += w2 * xa;
                    acc_second_b += w2 * xb;
                }

                /* Two channels of the first pixel, then the same two channels of the second. */
                dst[0] = (q15_t)__SSAT(acc_first_a >> out_shift, 16);
                dst[1] = (q15_t)__SSAT(acc_second_a >> out_shift, 16);
                dst_next[0] = (q15_t)__SSAT(acc_first_b >> out_shift, 16);
                dst_next[1] = (q15_t)__SSAT(acc_second_b >> out_shift, 16);
                dst += 2;
                dst_next += 2;

                /* w_first has consumed its own row; step over the row handled via w_second. */
                w_first += window_len;
            }

            /* The second pixel's outputs were already written through dst_next. */
            dst += ch_im_out;
            /* Start staging the next pair of columns. */
            fill = columns;
        }
    }
#else
    /* Reference path: plain nested loops (Cortex-M0 and Cortex-M3). bufferA is not needed. */
    (void)bufferA;

    if ((ch_im_in % 2) != 0 || (ch_im_out % 2) != 0)
    {
        /* The dimensions do not meet the constraints of this kernel. */
        return ARM_MATH_SIZE_MISMATCH;
    }

    int out_ch, out_row, out_col, ker_row, ker_col, in_ch;

    for (out_ch = 0; out_ch < ch_im_out; out_ch++)
    {
        for (out_row = 0; out_row < dim_im_out; out_row++)
        {
            for (out_col = 0; out_col < dim_im_out; out_col++)
            {
                int acc = ((q31_t)bias[out_ch] << bias_shift) + NN_ROUND(out_shift);

                for (ker_row = 0; ker_row < dim_kernel; ker_row++)
                {
                    const int src_row = stride * out_row + ker_row - padding;

                    for (ker_col = 0; ker_col < dim_kernel; ker_col++)
                    {
                        const int src_col = stride * out_col + ker_col - padding;

                        /* Taps that fall into the padding contribute nothing. */
                        if (src_row < 0 || src_col < 0 || src_row >= dim_im_in || src_col >= dim_im_in)
                        {
                            continue;
                        }

                        for (in_ch = 0; in_ch < ch_im_in; in_ch++)
                        {
                            acc += Im_in[(src_row * dim_im_in + src_col) * ch_im_in + in_ch] *
                                wt[out_ch * ch_im_in * dim_kernel * dim_kernel +
                                   (ker_row * dim_kernel + ker_col) * ch_im_in + in_ch];
                        }
                    }
                }

                Im_out[out_ch + (out_row * dim_im_out + out_col) * ch_im_out] =
                    (q15_t)__SSAT((acc >> out_shift), 16);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,270 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast_nonsquare.c
* Description: Fast Q15 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      scratch buffer for two im2col columns (not used by the reference path)
 * @param[in,out]   bufferB      not used; kept for interface compatibility
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is multiple of 2
 *
 * ch_im_out is multiple of 2
 *
 * dim_im_out_x is a multiple of 2 (checked on the DSP path, which consumes
 * output pixels in horizontal pairs)
 *
 */
arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q15_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q15_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q15_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
    q15_t *pBuffer = bufferA;
    q15_t *im_buffer = bufferA;
    q15_t *pOut = Im_out;

    /*
     * The multiply below runs once per pair of output pixels in x and only
     * then rewinds pBuffer. With an odd dim_im_out_x the last column of a row
     * would stay staged, the next row would write a third column past the end
     * of bufferA, and unrelated pixels would be paired. Reject that case up
     * front, as the square variant of this kernel does.
     */
    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || (dim_im_out_x & 0x1) != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* Run the following code for Cortex-M4 and Cortex-M7 */
    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* This part implements the im2col function: stage one kernel window as a column */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* Window position lies in the padding: contribute zeros */
                        memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
                    }
                    else
                    {
                        /* Copy all channels of the input pixel */
                        memcpy(pBuffer,
                               (q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
                               sizeof(q15_t) * ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* Two columns are staged after every odd i_out_x: multiply them */
            if (i_out_x & 0x1)
            {
                int i;
                /* initialize the matrix pointers for A */
                const q15_t *pA = wt;
                /* set up the second output pointers */
                q15_t *pOut2 = pOut + ch_im_out;

                /* this loop over rows in A, two output channels at a time */
                for (i = 0; i < ch_im_out; i += 2)
                {
                    /* setup pointers for B: the two staged columns */
                    const q15_t *pB = im_buffer;
                    const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x;
                    /* align the second pointer for A to the next weight row */
                    const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x;
                    /* init the sums with bias and the rounding term */
                    q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
                    uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1;

                    /* accumulate over the vector, two q15 values per operand per iteration */
                    while (colCnt)
                    {
                        q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
                        q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
                        q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
                        q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
                        sum = __SMLAD(inA1, inB1, sum);
                        sum2 = __SMLAD(inA1, inB2, sum2);
                        sum3 = __SMLAD(inA2, inB1, sum3);
                        sum4 = __SMLAD(inA2, inB2, sum4);
                        colCnt--;
                    } /* while over colCnt */

                    /* scalar tail for an odd column length */
                    colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1;
                    while (colCnt)
                    {
                        q15_t inA1 = *pA++;
                        q15_t inB1 = *pB++;
                        q15_t inA2 = *pA2++;
                        q15_t inB2 = *pB2++;
                        sum += inA1 * inB1;
                        sum2 += inA1 * inB2;
                        sum3 += inA2 * inB1;
                        sum4 += inA2 * inB2;
                        colCnt--;
                    } /* while over colCnt */

                    /* first pixel: channels i and i+1; second pixel: the same channels */
                    *pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
                    *pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
                    *pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
                    /* skip the row computed with A2 */
                    pA += ch_im_in * dim_kernel_y * dim_kernel_x;
                } /* for over ch_im_out */

                /* the second pixel's outputs were written through pOut2 */
                pOut += ch_im_out;
                /* counter reset */
                pBuffer = im_buffer;
            }
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;

    if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_y; j++)
        {
            for (k = 0; k < dim_im_out_x; k++)
            {
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel_y; m++)
                {
                    for (n = 0; n < dim_kernel_x; n++)
                    {
                        in_row = stride_y * j + m - padding_y;
                        in_col = stride_x * k + n - padding_x;
                        /* taps that fall into the padding contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
            }
        }
    }
#endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,280 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_RGB.c
* Description: Q7 version of convolution for RGB image
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Q7 convolution function for RGB image
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     scratch buffer for two q15 im2col columns (not used by the reference path)
 * @param[in,out]   bufferB     not used; kept for interface compatibility
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in equals 3
 *
 * This kernel is written exclusively for convolution with ch_im_in
 * equals 3. This applies on the first layer of CNNs which has input
 * image with RGB format.
 */
arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                                   const uint16_t dim_im_in,
                                   const uint16_t ch_im_in,
                                   const q7_t *wt,
                                   const uint16_t ch_im_out,
                                   const uint16_t dim_kernel,
                                   const uint16_t padding,
                                   const uint16_t stride,
                                   const q7_t *bias,
                                   const uint16_t bias_shift,
                                   const uint16_t out_shift,
                                   q7_t *Im_out,
                                   const uint16_t dim_im_out,
                                   q15_t *bufferA,
                                   q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
    /*
     * Here we use bufferA as q15_t internally as computation are done with q15_t level
     * im2col are done to output in q15_t format from q7_t input
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;

    /* check if number of input channels is 3 */
    if (ch_im_in != 3)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* This part implements the im2col function */
    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
                        arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
                        pBuffer += 3;
                    }
                    else
                    {
                        /*
                         * Equivalent to:
                         *  arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
                         */
                        const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
                        /*
                         * Load exactly the three bytes of the pixel into a zeroed word.
                         * A four-value read (arm_nn_read_q7x4) would fetch one byte
                         * past the pixel, i.e. past the end of Im_in for the last
                         * pixel of the image. The fourth ("omit") byte is never used
                         * below, so the result is unchanged on either endianness.
                         */
                        q31_t buf = 0;
                        union arm_nnword top;
                        union arm_nnword bottom;

                        arm_memcpy_q7((q7_t *)&buf, pPixel, 3);
                        top.word = __SXTB16(buf);
                        bottom.word = __SXTB16(__ROR(buf, 8));
#ifndef ARM_MATH_BIG_ENDIAN
                        /*
                         *  little-endian, | omit | 3rd  | 2nd  | 1st  |
                         *                MSB                         LSB
                         *   top | 3rd | 1st |; bottom | omit | 2nd |
                         *
                         *  version 1, need to swap 2nd and 3rd weight
                         * *__SIMD32(pBuffer) = top.word;
                         * *(pBuffer+2) = bottom.half_words[0];
                         *
                         *  version 2, no weight shuffling required
                         */
                        *pBuffer++ = top.half_words[0];
                        int32_t packed_word = __PKHBT(bottom.word, top.word, 0);
                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#else
                        /*
                         *  big-endian,    | 1st  | 2nd  | 3rd  | omit |
                         *                MSB                         LSB
                         *  top | 2nd | omit |; bottom | 1st | 3rd |
                         *
                         *  version 1, need to swap 2nd and 3rd weight
                         * *__SIMD32(pBuffer) = bottom.word;
                         * *(pBuffer+2) = top.half_words[1];
                         *
                         *  version 2, no weight shuffling required
                         */
                        *pBuffer++ = bottom.half_words[0];
                        int32_t packed_word = __PKHTB(top.word, bottom.word, 0);
                        arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#endif
                        pBuffer += 2;
                    }
                }
            }

            /* Once two columns are staged, hand them to the matrix-multiply kernel */
            if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15(
                    wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* left-over because odd number of output pixels */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;
        int i;

        for (i = 0; i < ch_im_out; i++)
        {
            /* Load the accumulator with bias and the rounding term first */
            q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
            /* Point to the beginning of the im2col buffer (read-only from here on) */
            const q15_t *pB = bufferA;
            /* basically each time it process 4 entries */
            uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2;

            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;

                pA = read_and_pad(pA, &inA1, &inA2);
                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);
                colCnt--;
            }

            /* scalar tail for the remaining 0..3 entries */
            colCnt = 3 * dim_kernel * dim_kernel & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }
            *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i, j, k, l, m, n;
    int conv_out;
    int in_row, in_col;

    /* check if number of input channels is 3 */
    if (ch_im_in != 3)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out; j++)
        {
            for (k = 0; k < dim_im_out; k++)
            {
                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
                for (m = 0; m < dim_kernel; m++)
                {
                    for (n = 0; n < dim_kernel; n++)
                    {
                        /* if-for implementation: taps in the padding contribute nothing */
                        in_row = stride * j + m - padding;
                        in_col = stride * k + n - padding;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */
    /* Return to application */
    return (ARM_MATH_SUCCESS);
}
/**
* @} end of NNConv group
*/

View File

@ -1,227 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic.c
* Description: Q7 version of convolution
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Basic Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     scratch buffer for two q15 im2col columns (not used by the reference path)
 * @param[in,out]   bufferB     not used; kept for interface compatibility
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * This basic version places no constraints on the tensor or weight
 * dimensions.
 */
arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q7_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q7_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q7_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB)
{
    /* bufferB belongs to the shared signature only; nothing below touches it. */
    (void)bufferB;

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /*
     * SIMD path (Cortex-M4 and Cortex-M7).
     * The q7 input is widened to q15 while being laid out as im2col columns
     * in bufferA; all arithmetic then happens at q15 level.
     */
    const int window_len = ch_im_in * dim_kernel * dim_kernel; /* q15 values per column */
    q15_t *const columns = bufferA;
    q15_t *fill = columns;
    q7_t *dst = Im_out;
    int16_t out_row, out_col, src_row, src_col;

    for (out_row = 0; out_row < dim_im_out; out_row++)
    {
        for (out_col = 0; out_col < dim_im_out; out_col++)
        {
            const int first_row = out_row * stride - padding;
            const int first_col = out_col * stride - padding;

            /* im2col: stage one kernel window, writing zeros where it overlaps the padding. */
            for (src_row = (int16_t)first_row; src_row < first_row + dim_kernel; src_row++)
            {
                for (src_col = (int16_t)first_col; src_col < first_col + dim_kernel; src_col++)
                {
                    const int inside =
                        src_row >= 0 && src_row < dim_im_in && src_col >= 0 && src_col < dim_im_in;

                    if (inside)
                    {
                        arm_q7_to_q15_no_shift(
                            (q7_t *)Im_in + (src_row * dim_im_in + src_col) * ch_im_in, fill, ch_im_in);
                    }
                    else
                    {
                        memset(fill, 0, sizeof(q15_t) * ch_im_in);
                    }
                    fill += ch_im_in;
                }
            }

            /* As soon as two columns are staged, run the matrix-multiply kernel on them. */
            if (fill == columns + 2 * window_len)
            {
                dst = arm_nn_mat_mult_kernel_q7_q15(
                    wt, columns, ch_im_out, window_len, bias_shift, out_shift, bias, dst);
                fill = columns;
            }
        }
    }

    /* An odd number of output pixels leaves one staged column to finish here. */
    if (fill != columns)
    {
        const q7_t *w = wt;
        int ch;

        for (ch = 0; ch < ch_im_out; ch++)
        {
            /* Start from the shifted bias plus the rounding term. */
            q31_t acc = ((q31_t)bias[ch] << bias_shift) + NN_ROUND(out_shift);
            const q15_t *col = columns;
            uint16_t left;

            /* Main loop: four weights and four column entries per iteration. */
            for (left = (uint16_t)(window_len >> 2); left != 0; left--)
            {
                q31_t w_first, w_second;
                q31_t x_first, x_second;

                w = read_and_pad(w, &w_first, &w_second);
                x_first = arm_nn_read_q15x2_ia(&col);
                acc = __SMLAD(w_first, x_first, acc);
                x_second = arm_nn_read_q15x2_ia(&col);
                acc = __SMLAD(w_second, x_second, acc);
            }

            /* Scalar tail for the remaining 0..3 entries. */
            for (left = (uint16_t)(window_len & 0x3); left != 0; left--)
            {
                const q7_t w_val = *w++;
                const q15_t x_val = *col++;

                acc += w_val * x_val;
            }

            *dst++ = (q7_t)__SSAT((acc >> out_shift), 8);
        }
    }
#else
    /* Reference path: plain nested loops (Cortex-M0 and Cortex-M3). bufferA is not needed. */
    (void)bufferA;

    int out_ch, out_row, out_col, ker_row, ker_col, in_ch;

    for (out_ch = 0; out_ch < ch_im_out; out_ch++)
    {
        for (out_row = 0; out_row < dim_im_out; out_row++)
        {
            for (out_col = 0; out_col < dim_im_out; out_col++)
            {
                int acc = ((q31_t)bias[out_ch] << bias_shift) + NN_ROUND(out_shift);

                for (ker_row = 0; ker_row < dim_kernel; ker_row++)
                {
                    const int src_row = stride * out_row + ker_row - padding;

                    for (ker_col = 0; ker_col < dim_kernel; ker_col++)
                    {
                        const int src_col = stride * out_col + ker_col - padding;

                        /* Taps that fall into the padding contribute nothing. */
                        if (src_row < 0 || src_col < 0 || src_row >= dim_im_in || src_col >= dim_im_in)
                        {
                            continue;
                        }

                        for (in_ch = 0; in_ch < ch_im_in; in_ch++)
                        {
                            acc += Im_in[(src_row * dim_im_in + src_col) * ch_im_in + in_ch] *
                                wt[out_ch * ch_im_in * dim_kernel * dim_kernel +
                                   (ker_row * dim_kernel + ker_col) * ch_im_in + in_ch];
                        }
                    }
                }

                Im_out[out_ch + (out_row * dim_im_out + out_col) * ch_im_out] =
                    (q7_t)__SSAT((acc >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,229 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic_nonsquare.c
* Description: Q7 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Basic Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      scratch buffer for two q15 im2col columns, i.e.
 *                               2*ch_im_in*dim_kernel_x*dim_kernel_y values
 *                               (not used by the reference path)
 * @param[in,out]   bufferB      not used; kept for interface compatibility
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 */
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
{
    /* bufferB belongs to the shared signature only; nothing below touches it. */
    (void)bufferB;

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /*
     * SIMD path (Cortex-M4 and Cortex-M7).
     * The q7 input is widened to q15 while being laid out as im2col columns
     * in bufferA; all arithmetic then happens at q15 level.
     */
    const int window_len = ch_im_in * dim_kernel_y * dim_kernel_x; /* q15 values per column */
    q15_t *const columns = bufferA;
    q15_t *fill = columns;
    q7_t *dst = Im_out;
    int16_t out_row, out_col, src_row, src_col;

    for (out_row = 0; out_row < dim_im_out_y; out_row++)
    {
        for (out_col = 0; out_col < dim_im_out_x; out_col++)
        {
            const int first_row = out_row * stride_y - padding_y;
            const int first_col = out_col * stride_x - padding_x;

            /* im2col: stage one kernel window, writing zeros where it overlaps the padding. */
            for (src_row = (int16_t)first_row; src_row < first_row + dim_kernel_y; src_row++)
            {
                for (src_col = (int16_t)first_col; src_col < first_col + dim_kernel_x; src_col++)
                {
                    const int inside =
                        src_row >= 0 && src_row < dim_im_in_y && src_col >= 0 && src_col < dim_im_in_x;

                    if (inside)
                    {
                        arm_q7_to_q15_no_shift(
                            (q7_t *)Im_in + (src_row * dim_im_in_x + src_col) * ch_im_in, fill, ch_im_in);
                    }
                    else
                    {
                        memset(fill, 0, sizeof(q15_t) * ch_im_in);
                    }
                    fill += ch_im_in;
                }
            }

            /* As soon as two columns are staged, run the matrix-multiply kernel on them. */
            if (fill == columns + 2 * window_len)
            {
                dst = arm_nn_mat_mult_kernel_q7_q15(
                    wt, columns, ch_im_out, window_len, bias_shift, out_shift, bias, dst);
                fill = columns;
            }
        }
    }

    /* An odd number of output pixels leaves one staged column to finish here. */
    if (fill != columns)
    {
        const q7_t *w = wt;
        int ch;

        for (ch = 0; ch < ch_im_out; ch++)
        {
            /* Start from the shifted bias plus the rounding term. */
            q31_t acc = ((q31_t)bias[ch] << bias_shift) + NN_ROUND(out_shift);
            const q15_t *col = columns;
            uint16_t left;

            /* Main loop: four weights and four column entries per iteration. */
            for (left = (uint16_t)(window_len >> 2); left != 0; left--)
            {
                q31_t w_first, w_second;
                q31_t x_first, x_second;

                w = read_and_pad(w, &w_first, &w_second);
                x_first = arm_nn_read_q15x2_ia(&col);
                acc = __SMLAD(w_first, x_first, acc);
                x_second = arm_nn_read_q15x2_ia(&col);
                acc = __SMLAD(w_second, x_second, acc);
            }

            /* Scalar tail for the remaining 0..3 entries. */
            for (left = (uint16_t)(window_len & 0x3); left != 0; left--)
            {
                const q7_t w_val = *w++;
                const q15_t x_val = *col++;

                acc += w_val * x_val;
            }

            *dst++ = (q7_t)__SSAT((acc >> out_shift), 8);
        }
    }
#else
    /* Reference path: plain nested loops (Cortex-M0 and Cortex-M3). bufferA is not needed. */
    (void)bufferA;

    int out_ch, out_row, out_col, ker_row, ker_col, in_ch;

    for (out_ch = 0; out_ch < ch_im_out; out_ch++)
    {
        for (out_row = 0; out_row < dim_im_out_y; out_row++)
        {
            for (out_col = 0; out_col < dim_im_out_x; out_col++)
            {
                int acc = ((q31_t)bias[out_ch] << bias_shift) + NN_ROUND(out_shift);

                for (ker_row = 0; ker_row < dim_kernel_y; ker_row++)
                {
                    const int src_row = stride_y * out_row + ker_row - padding_y;

                    for (ker_col = 0; ker_col < dim_kernel_x; ker_col++)
                    {
                        const int src_col = stride_x * out_col + ker_col - padding_x;

                        /* Taps that fall into the padding contribute nothing. */
                        if (src_row < 0 || src_col < 0 || src_row >= dim_im_in_y || src_col >= dim_im_in_x)
                        {
                            continue;
                        }

                        for (in_ch = 0; in_ch < ch_im_in; in_ch++)
                        {
                            acc += Im_in[(src_row * dim_im_in_x + src_col) * ch_im_in + in_ch] *
                                wt[out_ch * ch_im_in * dim_kernel_y * dim_kernel_x +
                                   (ker_row * dim_kernel_x + ker_col) * ch_im_in + in_ch];
                        }
                    }
                }

                Im_out[out_ch + (out_row * dim_im_out_x + out_col) * ch_im_out] =
                    (q7_t)__SSAT((acc >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,380 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast.c
* Description: Fast Q7 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )
*
* ch_im_out is multiple of 2 ( because 2x2 mat_mult kernel )
*
* The im2col converts the Q7 tensor input into Q15 column, which is stored in
* bufferA. There is reordering happening during this im2col process with
* arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
* third elements are swapped.
*
* The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
* GEMM computation with the reordered columns.
*
* To speed-up the determination of the padding condition, we split the
* computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
* This reduces the total number of boundary condition checks and improves
* the data copying performance.
*/
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally as computation are done with q15_t level
* im2col are done to output in q15_t format from q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out - padding; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out - padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in +
(i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in,
pBuffer,
ch_im_in * dim_kernel);
pBuffer += ch_im_in * dim_kernel;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over for compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each time it process 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,378 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Expands one kernel window into a reordered q15 im2col column; taps outside the input are zero-filled. */
static q15_t *conv_HWC_q7_fast_nonsquare_fill_column(const q7_t *Im_in,
                                                     const uint16_t dim_im_in_x,
                                                     const uint16_t dim_im_in_y,
                                                     const uint16_t ch_im_in,
                                                     const uint16_t dim_kernel_x,
                                                     const uint16_t dim_kernel_y,
                                                     const int32_t first_row,
                                                     const int32_t first_col,
                                                     q15_t *pBuffer)
{
    for (int32_t in_row = first_row; in_row < first_row + dim_kernel_y; in_row++)
    {
        for (int32_t in_col = first_col; in_col < first_col + dim_kernel_x; in_col++)
        {
            if (in_row < 0 || in_row >= dim_im_in_y || in_col < 0 || in_col >= dim_im_in_x)
            {
                /* Padding tap: contributes zeros for every input channel */
                memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
            }
            else
            {
                arm_q7_to_q15_reordered_no_shift(
                    (q7_t *)Im_in + (in_row * dim_im_in_x + in_col) * ch_im_in, pBuffer, ch_im_in);
            }
            pBuffer += ch_im_in;
        }
    }
    return pBuffer;
}
#endif /* ARM_MATH_DSP && !ARM_MATH_MVEI */

/**
 * @brief Fast Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output (unused)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but
 * with some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
 *
 * On the DSP path bufferA must hold two im2col columns, i.e.
 * 2*ch_im_in*dim_kernel_x*dim_kernel_y q15_t elements.
 *
 * Output pixels are classified as interior or border. A pixel is interior
 * when its y index lies in [padding_y, dim_im_out_y - padding_y) and its x
 * index lies in [padding_x, dim_im_out_x - padding_x); its kernel rows are
 * then copied with one contiguous conversion per row. All other pixels go
 * through a bounds-checked, zero-padding im2col.
 *
 * All output loops are bounded by dim_im_out_x / dim_im_out_y, so padding
 * values larger than the output dimensions cannot produce more output pixels
 * than Im_out holds.
 */
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                              const uint16_t dim_im_in_x,
                                              const uint16_t dim_im_in_y,
                                              const uint16_t ch_im_in,
                                              const q7_t *wt,
                                              const uint16_t ch_im_out,
                                              const uint16_t dim_kernel_x,
                                              const uint16_t dim_kernel_y,
                                              const uint16_t padding_x,
                                              const uint16_t padding_y,
                                              const uint16_t stride_x,
                                              const uint16_t stride_y,
                                              const q7_t *bias,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              q7_t *Im_out,
                                              const uint16_t dim_im_out_x,
                                              const uint16_t dim_im_out_y,
                                              q15_t *bufferA,
                                              q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */

    /*
     * bufferA is used as q15_t scratch: im2col widens the q7_t input to q15_t
     * and the GEMM kernel consumes it at q15_t level.
     */
    q15_t *pBuffer = bufferA;
    q7_t *pOut = Im_out;
    /* Number of q15 values in one im2col column */
    const int32_t col_len = ch_im_in * dim_kernel_x * dim_kernel_y;

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (int32_t i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        const int32_t first_row = i_out_y * stride_y - padding_y;
        const int row_is_interior = (i_out_y >= padding_y) && (i_out_y < dim_im_out_y - padding_y);

        for (int32_t i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            const int32_t first_col = i_out_x * stride_x - padding_x;

            if (row_is_interior && i_out_x >= padding_x && i_out_x < dim_im_out_x - padding_x)
            {
                /*
                 * Interior pixel: no tap is treated as padding, so each kernel
                 * row (dim_kernel_x pixels x ch_im_in channels) is contiguous
                 * in the HWC input and is converted in a single call.
                 */
                for (int32_t in_row = first_row; in_row < first_row + dim_kernel_y; in_row++)
                {
                    arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in + (in_row * dim_im_in_x + first_col) * ch_im_in,
                                                     pBuffer,
                                                     ch_im_in * dim_kernel_x);
                    pBuffer += ch_im_in * dim_kernel_x;
                }
            }
            else
            {
                /* Border pixel: im2col with per-tap bounds check and zero padding */
                pBuffer = conv_HWC_q7_fast_nonsquare_fill_column(Im_in,
                                                                 dim_im_in_x,
                                                                 dim_im_in_y,
                                                                 ch_im_in,
                                                                 dim_kernel_x,
                                                                 dim_kernel_y,
                                                                 first_row,
                                                                 first_col,
                                                                 pBuffer);
            }

            /* The GEMM kernel is fired once two columns have been collected */
            if (pBuffer == bufferA + 2 * col_len)
            {
                pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
                    wt, bufferA, ch_im_out, col_len, bias_shift, out_shift, bias, pOut);
                /* counter reset */
                pBuffer = bufferA;
            }
        }
    }

    /* An odd number of output pixels leaves one column in bufferA */
    if (pBuffer != bufferA)
    {
        const q7_t *pA = wt;

        for (int32_t i = 0; i < ch_im_out; i++)
        {
            q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
            const q15_t *pB = bufferA;

            /* each iteration consumes 4 weights and 4 column entries */
            uint16_t colCnt = col_len >> 2;
            while (colCnt)
            {
                q31_t inA1, inA2;
                q31_t inB1, inB2;

                pA = read_and_pad_reordered(pA, &inA1, &inA2);

                inB1 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA1, inB1, sum);
                inB2 = arm_nn_read_q15x2_ia(&pB);
                sum = __SMLAD(inA2, inB2, sum);

                colCnt--;
            }

            /* remaining entries when the column length is not a multiple of 4 */
            colCnt = col_len & 0x3;
            while (colCnt)
            {
                q7_t inA1 = *pA++;
                q15_t inB1 = *pB++;
                sum += inA1 * inB1;
                colCnt--;
            }

            *pOut = (q7_t)__SSAT((sum >> out_shift), 8);
            pOut++;
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */

    if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
    {
        /* check if the input dimension meets the constraints */
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (int i = 0; i < ch_im_out; i++)
    {
        for (int j = 0; j < dim_im_out_y; j++)
        {
            for (int k = 0; k < dim_im_out_x; k++)
            {
                int conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);

                for (int m = 0; m < dim_kernel_y; m++)
                {
                    for (int n = 0; n < dim_kernel_x; n++)
                    {
                        /* taps that fall into the padding contribute nothing */
                        const int in_row = stride_y * j + m - padding_y;
                        const int in_col = stride_x * k + n - padding_x;

                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            for (int l = 0; l < ch_im_in; l++)
                            {
                                conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
                                       l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,241 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_fast_s16.c
* Description: Optimized s16 version of convolution.
*
* $Date: 12 August 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * Fast s16 convolution function.
 *
 * Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and
 * output channels are multiples of 4 or at least greater than 4.
 *
 * Returns:
 *  - ARM_MATH_SIZE_MISMATCH  when filter width * filter height * input channels is 512 or more,
 *  - ARM_MATH_ARGUMENT_ERROR when a scratch buffer is required but ctx->buf is NULL, or when the build has no
 *                            DSP-only implementation (ARM_MATH_DSP undefined or ARM_MATH_MVEI defined),
 *  - ARM_MATH_SUCCESS        otherwise.
 */
arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
                                 const cmsis_nn_conv_params *conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q15_t *input_data,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *filter_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const int64_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q15_t *output_data)
{
    (void)bias_dims;

    /*
     * Columns of 512 or more elements are rejected.
     * NOTE(review): presumably this keeps the 32-bit accumulator from overflowing - confirm against the header.
     */
    if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* Validate the scratch buffer against this kernel's own size requirement */
    if (ctx->buf == NULL && arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims) > 0)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }
    q15_t *buffer_a = (q15_t *)ctx->buf;

    const int32_t input_batches = input_dims->n;
    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t input_ch = input_dims->c;
    const int32_t kernel_x = filter_dims->w;
    const int32_t kernel_y = filter_dims->h;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_ch = output_dims->c;

    const int32_t pad_x = conv_params->padding.w;
    const int32_t pad_y = conv_params->padding.h;
    const int32_t stride_x = conv_params->stride.w;
    const int32_t stride_y = conv_params->stride.h;

    const int16_t out_activation_min = conv_params->activation.min;
    const int16_t out_activation_max = conv_params->activation.max;
    int32_t *output_mult = quant_params->multiplier;
    int32_t *output_shift = quant_params->shift;

    for (int i_batch = 0; i_batch < input_batches; i_batch++)
    {
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
        /* Number of q15 values in one im2col column */
        const int32_t col_len = input_ch * kernel_y * kernel_x;
        /* Two columns are generated from the input tensor for each GEMM call */
        q15_t *two_column_buf = buffer_a;
        q15_t *out = output_data;

        /* This part implements the im2col function */
        for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            const int32_t first_row = i_out_y * stride_y - pad_y;

            for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                const int32_t first_col = i_out_x * stride_x - pad_x;

                for (int32_t i_ker_y = first_row; i_ker_y < first_row + kernel_y; i_ker_y++)
                {
                    for (int32_t i_ker_x = first_col; i_ker_x < first_col + kernel_x; i_ker_x++)
                    {
                        if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
                        {
                            /* Filling 0 for out-of-bound paddings */
                            arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
                        }
                        else
                        {
                            /* Input is already q15: a raw byte copy of one pixel's channels is enough */
                            arm_memcpy_q7((q7_t *)two_column_buf,
                                          (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
                                          input_ch * sizeof(q15_t));
                        }
                        two_column_buf += input_ch;
                    }
                }

                /* Computation is fired for every 2 columns */
                if (two_column_buf == buffer_a + 2 * col_len)
                {
                    out = arm_nn_mat_mult_kernel_s16(filter_data,
                                                     buffer_a,
                                                     output_ch,
                                                     output_shift,
                                                     output_mult,
                                                     out_activation_min,
                                                     out_activation_max,
                                                     col_len,
                                                     bias_data,
                                                     out);

                    /* Counter reset */
                    two_column_buf = buffer_a;
                }
            }
        }

        /* Left-over because odd number of output pixels */
        if (two_column_buf != buffer_a)
        {
            const q7_t *ker_a = filter_data;

            for (int i = 0; i < output_ch; i++)
            {
                /* Init the accumulator */
                q31_t sum = 0;

                /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
                const q15_t *ip_as_col = buffer_a;

                /* 4 multiply and accumulates are done in one loop. */
                uint16_t col_count = col_len >> 2;
                while (col_count)
                {
                    q31_t ker_a1, ker_a2;
                    q31_t ip_b1, ip_b2;

                    ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);

                    ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a1, ip_b1, sum);
                    ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
                    sum = __SMLAD(ker_a2, ip_b2, sum);

                    col_count--;
                }

                /* Handle left over mac */
                col_count = col_len & 0x3;
                while (col_count)
                {
                    q7_t ker_a1 = *ker_a++;
                    q15_t ip_b1 = *ip_as_col++;
                    sum += ker_a1 * ip_b1;
                    col_count--;
                }

                if (bias_data)
                {
                    /* The 64-bit bias is added before requantizing with the reduced multiplier */
                    q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
                    q63_t acc_64 = sum + bias_data[i];
                    sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
                }
                else
                {
                    sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
                }

                /* Clamp to the activation range */
                sum = MAX(sum, out_activation_min);
                sum = MIN(sum, out_activation_max);
                *out++ = (q15_t)sum;
            }
        }
#else
        /* No implementation for this build configuration: report it to the caller */
        (void)input_data;
        (void)output_data;
        (void)bias_data;
        (void)filter_data;
        (void)buffer_a;
        (void)kernel_x;
        (void)kernel_y;
        (void)pad_x;
        (void)pad_y;
        (void)stride_x;
        (void)stride_y;
        (void)out_activation_min;
        (void)out_activation_max;
        (void)output_mult;
        (void)output_shift;
        return ARM_MATH_ARGUMENT_ERROR;
#endif
        /* Advance to the next batch */
        input_data += (input_x * input_y * input_ch);
        output_data += (output_x * output_y * output_ch);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/*
 * Scratch buffer size in bytes needed by arm_convolve_fast_s16.
 *
 * The DSP-only build needs room for two im2col columns of 16-bit values; every other build needs none.
 */
int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    const int32_t two_columns = 2 * input_dims->c * filter_dims->w * filter_dims->h;

    return two_columns * (int32_t)sizeof(int16_t);
#else
    (void)filter_dims;
    (void)input_dims;

    return 0;
#endif
}
/**
* @} end of NNConv group
*/

View File

@ -1,156 +0,0 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s16.c
* Description: s16 version of convolution using symmetric quantization.
*
* $Date: January 13, 2022
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * Basic s16 convolution function.
 *
 * Refer to the header file for details. This is a plain nested-loop reference implementation: for every batch,
 * output channel and output pixel it accumulates input * weight over the kernel taps that land inside the input
 * (honouring dilation), adds the optional 64-bit bias, requantizes per channel and clamps to the activation range.
 * The ctx and bias_dims arguments are not used.
 */
arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
                            const cmsis_nn_conv_params *conv_params,
                            const cmsis_nn_per_channel_quant_params *quant_params,
                            const cmsis_nn_dims *input_dims,
                            const q15_t *input_data,
                            const cmsis_nn_dims *filter_dims,
                            const q7_t *filter_data,
                            const cmsis_nn_dims *bias_dims,
                            const int64_t *bias_data,
                            const cmsis_nn_dims *output_dims,
                            q15_t *output_data)
{
    (void)ctx;
    (void)bias_dims;

    /* Tensor geometry */
    const int32_t batch_cnt = input_dims->n;
    const int32_t in_w = input_dims->w;
    const int32_t in_h = input_dims->h;
    const int32_t in_c = input_dims->c;
    const int32_t ker_w = filter_dims->w;
    const int32_t ker_h = filter_dims->h;
    const int32_t out_w = output_dims->w;
    const int32_t out_h = output_dims->h;
    const int32_t out_c = output_dims->c;

    /* Convolution parameters */
    const int32_t pad_w = conv_params->padding.w;
    const int32_t pad_h = conv_params->padding.h;
    const int32_t step_w = conv_params->stride.w;
    const int32_t step_h = conv_params->stride.h;
    const int32_t dil_w = conv_params->dilation.w;
    const int32_t dil_h = conv_params->dilation.h;
    const int32_t act_lo = conv_params->activation.min;
    const int32_t act_hi = conv_params->activation.max;

    /* Per-output-channel quantization */
    const int32_t *ch_mult = quant_params->multiplier;
    const int32_t *ch_shift = quant_params->shift;

    for (int32_t batch = 0; batch < batch_cnt; batch++)
    {
        /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
        for (int32_t oc = 0; oc < out_c; oc++)
        {
            const q31_t mult_reduced = REDUCE_MULTIPLIER(ch_mult[oc]);

            for (int32_t oy = 0; oy < out_h; oy++)
            {
                /* Input row hit by kernel row 0 for this output row */
                const int32_t origin_y = oy * step_h - pad_h;

                for (int32_t ox = 0; ox < out_w; ox++)
                {
                    /* Input column hit by kernel column 0 for this output column */
                    const int32_t origin_x = ox * step_w - pad_w;

                    /* Restrict the kernel taps to those that fall inside the input */
                    const int32_t ky_first = (-origin_y + dil_h - 1) / dil_h;
                    const int32_t kx_first = (-origin_x + dil_w - 1) / dil_w;
                    const int32_t ky_last = (in_h - origin_y + dil_h - 1) / dil_h;
                    const int32_t kx_last = (in_w - origin_x + dil_w - 1) / dil_w;

                    const int32_t ky_begin = (ky_first < 0) ? 0 : ky_first;
                    const int32_t kx_begin = (kx_first < 0) ? 0 : kx_first;
                    const int32_t ky_end = (ky_last < ker_h) ? ky_last : ker_h;
                    const int32_t kx_end = (kx_last < ker_w) ? kx_last : ker_w;

                    int64_t acc = 0;

                    for (int32_t ky = ky_begin; ky < ky_end; ky++)
                    {
                        const int32_t src_y = origin_y + dil_h * ky;

                        for (int32_t kx = kx_begin; kx < kx_end; kx++)
                        {
                            const int32_t src_x = origin_x + dil_w * kx;

                            for (int32_t ic = 0; ic < in_c; ic++)
                            {
                                acc += input_data[(src_y * in_w + src_x) * in_c + ic] *
                                    filter_data[oc * in_c * ker_h * ker_w + (ky * ker_w + kx) * in_c + ic];
                            }
                        }
                    }

                    if (bias_data)
                    {
                        acc += bias_data[oc];
                    }

                    int32_t result = arm_nn_requantize_s64(acc, mult_reduced, ch_shift[oc]);

                    /* Clamp to the activation range */
                    if (result < act_lo)
                    {
                        result = act_lo;
                    }
                    if (result > act_hi)
                    {
                        result = act_hi;
                    }

                    output_data[oc + (oy * out_w + ox) * out_c] = (int16_t)result;
                }
            }
        }

        /* Advance to the next batch */
        input_data += (in_w * in_h * in_c);
        output_data += (out_w * out_h * out_c);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/*
 * Scratch buffer size in bytes needed by arm_convolve_s16.
 *
 * Always 0: both arguments are ignored.
 */
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
    (void)filter_dims;
    (void)input_dims;

    return 0;
}
/**
* @} end of NNConv group
*/

View File

@ -1,335 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s8.c
* Description: s8 version of convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s8 convolution function.
*
* Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output channels
* are multiples of 4 or at least greater than 4.
*
*/
/*
 * s8 convolution implemented as im2col followed by a matrix multiplication.
 *
 * For every batch, input patches are copied column by column into the scratch buffer ctx->buf and
 * then handed to a matrix-multiply helper that writes the q7 results to the output tensor.
 *
 * ctx          : holder of the scratch buffer; ctx->buf is used as the im2col buffer.
 * conv_params  : padding, stride, dilation, input/output offsets and activation range are read.
 * quant_params : per-output-channel multiplier and shift arrays.
 * input_dims   : n, h, w and c are read.
 * input_data   : input tensor, indexed as (y * w + x) * c within one batch.
 * filter_dims  : only w and h are read.
 * filter_data  : filter weights, forwarded to the matrix-multiply helpers.
 * bias_dims    : unused.
 * bias_data    : per-output-channel bias. The scalar left-over path below accepts NULL.
 *                NOTE(review): NULL handling inside the helper kernels is not visible here.
 * output_dims  : h, w and c are read.
 * output_data  : output tensor.
 *
 * Returns ARM_MATH_ARGUMENT_ERROR when ctx->buf is NULL although
 * arm_convolve_s8_get_buffer_size() reports a non-zero size, ARM_MATH_SUCCESS otherwise.
 */
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
/* A scratch buffer is mandatory whenever the size query reports a non-zero requirement */
if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q15_t *buffer_a = (q15_t *)ctx->buf;
const int32_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_MVEI)
/* MVE variant: gather up to four im2col columns (kept as int8) before each GEMM call */
q7_t *im2col_buf = (q7_t *)buffer_a;
q7_t *out = output_data;
/* Number of columns currently stored in the buffer */
int32_t buffer_fill_cnt = 0;
/* Set as soon as one of the buffered columns contains an out-of-bounds (padded) position */
int32_t padded = 0;
/* Length of one im2col column */
const int32_t num_elem = kernel_x * kernel_y * input_ch;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
/* Input coordinates of the top-left kernel tap; negative inside the padding */
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
/* Out-of-bounds tap: fill with -input_offset, presumably so that it becomes zero once the
 * kernel adds input_offset - TODO confirm against arm_nn_mat_mult_s8 */
memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}
buffer_fill_cnt++;
/* Computation is performed for every 4 columns */
if (buffer_fill_cnt == 4 && (padded == 0))
{
/* No padded tap in this group of four columns */
buffer_fill_cnt = 0;
out = arm_nn_mat_mul_core_4x_s8(num_elem,
num_elem,
(q7_t *)buffer_a,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
}
else if (buffer_fill_cnt == 4 && (padded != 0))
{
/* At least one padded tap in this group of four columns.
 * NOTE(review): the meaning of the literal 0 argument is defined by the prototype of
 * arm_nn_mat_mult_s8, which is not visible here */
buffer_fill_cnt = 0;
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
4,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
padded = 0;
}
}
}
/* Handle left over columns (fewer than four remain in the buffer) */
if (buffer_fill_cnt != 0)
{
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
buffer_fill_cnt,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
}
#else // #if defined(ARM_MATH_MVEI)
/* Non-MVE variant: columns are widened to q15 and processed two at a time */
const uint16_t dilation_x = conv_params->dilation.w;
const uint16_t dilation_y = conv_params->dilation.h;
int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* Generate two columns from the input tensor for a GEMM computation */
q15_t *two_column_buf = buffer_a;
q7_t *out = output_data;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < output_x; i_out_x++)
{
/* Input coordinates of the top-left kernel tap; negative inside the padding */
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
/* Copying the pixel data to the column; input_offset is passed along to the conversion helper */
arm_q7_to_q15_with_offset(
input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset);
}
two_column_buf += input_ch;
}
}
/* Computation is performed for every 2 columns */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_offset,
out_activation_min,
out_activation_max,
input_ch * kernel_y * kernel_x,
bias_data,
out);
/* counter reset */
two_column_buf = buffer_a;
}
}
}
/* Left-over column because of an odd number of output pixels */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
/* One dot product per output channel; ker_a keeps advancing through the filter data */
for (i = 0; i < output_ch; i++)
{
/* Load the accumulator with bias first */
q31_t sum = 0;
if (bias_data)
{
sum = bias_data[i];
}
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
#if defined(ARM_MATH_DSP)
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac (column length modulo 4) */
col_count = input_ch * kernel_y * kernel_x & 0x3;
#else
uint16_t col_count = input_ch * kernel_y * kernel_x;
#endif
/* Scalar multiply-accumulate for the remaining elements */
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
/* Requantize, add the output offset and clamp to the activation range */
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q7_t)sum;
}
}
#endif // #if defined(ARM_MATH_MVEI)
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/*
 * Size in bytes of the scratch buffer that arm_convolve_s8 expects in ctx->buf.
 *
 * Only input_dims->c, filter_dims->w and filter_dims->h are read.
 */
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
    /* Elements in one im2col column */
    const int32_t column_elems = input_dims->c * filter_dims->w * filter_dims->h;

    /* Round up to whole groups of 8 int16 lanes; this depends on the implementation of
     * arm_nn_mat_mult_s8 */
    const int32_t lane_groups = (column_elems + 7) / 8;

    /* 4 -> number of im2col buffers, 8 -> 8 elements per Q register */
    return 4 * lane_groups * 8 * (int32_t)sizeof(int8_t);
#else
    /* Two int16 im2col columns */
    return (int32_t)sizeof(int16_t) * (2 * input_dims->c * filter_dims->w * filter_dims->h);
#endif
}
/**
* @} end of NNConv group
*/

View File

@ -1,130 +0,0 @@
/*
* Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s16.c
* Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
* cmsis-nn to perform the convolution.
*
* $Date: 13 January 2022
* $Revision: V.1.2.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer header file for details.
*
*/
/*
 * s16 convolution dispatcher.
 *
 * On DSP targets without MVE, arm_convolve_fast_s16 is used when the filter volume
 * (filter w * filter h * input channels) is below 512 and the dilation is 1x1.
 * In every other case the call is forwarded to arm_convolve_s16.
 * The status of the selected kernel is returned unchanged.
 */
arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
                                    const cmsis_nn_conv_params *conv_params,
                                    const cmsis_nn_per_channel_quant_params *quant_params,
                                    const cmsis_nn_dims *input_dims,
                                    const q15_t *input_data,
                                    const cmsis_nn_dims *filter_dims,
                                    const q7_t *filter_data,
                                    const cmsis_nn_dims *bias_dims,
                                    const int64_t *bias_data,
                                    const cmsis_nn_dims *output_dims,
                                    q15_t *output_data)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Fast path: small filter volume and no dilation */
    if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
        (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
    {
        return arm_convolve_fast_s16(ctx,
                                     conv_params,
                                     quant_params,
                                     input_dims,
                                     input_data,
                                     filter_dims,
                                     filter_data,
                                     bias_dims,
                                     bias_data,
                                     output_dims,
                                     output_data);
    }
#endif

    /* Generic kernel for everything else */
    return arm_convolve_s16(ctx,
                            conv_params,
                            quant_params,
                            input_dims,
                            input_data,
                            filter_dims,
                            filter_data,
                            bias_dims,
                            bias_data,
                            output_dims,
                            output_data);
}
/*
 * Scratch buffer size for the kernel that arm_convolve_wrapper_s16 selects.
 *
 * Uses the same selection rule as the wrapper: on DSP targets without MVE the fast kernel's
 * size is reported when the filter volume is below 512 and the dilation is 1x1; otherwise the
 * size of the generic s16 kernel is reported. output_dims is not used.
 */
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                                 const cmsis_nn_dims *input_dims,
                                                 const cmsis_nn_dims *filter_dims,
                                                 const cmsis_nn_dims *output_dims)
{
    (void)output_dims;

#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
        (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
    {
        return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
    }
#else
    /* conv_params is only consulted on the DSP path */
    (void)conv_params;
#endif

    return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
}
/**
* @} end of NNConv group
*/

View File

@ -1,133 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s8.c
* Description: s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
* cmsis-nn to perform the convolution.
*
* $Date: 02. December 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer header file for details.
*
*/
/*
 * s8 convolution dispatcher.
 *
 * Selection rules, checked in this order:
 *  1. arm_convolve_1x1_s8_fast : no padding, input channels a multiple of 4, stride 1x1,
 *                                1x1 filter, dilation 1x1.
 *  2. arm_convolve_1_x_n_s8    : output, input and filter height of 1, output width a multiple
 *                                of 4, a single batch, dilation 1x1.
 *  3. arm_convolve_s8          : everything else.
 * The status of the selected kernel is returned unchanged.
 */
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
                                   const cmsis_nn_conv_params *conv_params,
                                   const cmsis_nn_per_channel_quant_params *quant_params,
                                   const cmsis_nn_dims *input_dims,
                                   const q7_t *input_data,
                                   const cmsis_nn_dims *filter_dims,
                                   const q7_t *filter_data,
                                   const cmsis_nn_dims *bias_dims,
                                   const int32_t *bias_data,
                                   const cmsis_nn_dims *output_dims,
                                   q7_t *output_data)
{
    /* Both specialised kernels are only selected for a dilation of 1 in each direction */
    const int unit_dilation = (conv_params->dilation.w == 1 && conv_params->dilation.h == 1);

    if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
        (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
        (filter_dims->h == 1) && unit_dilation)
    {
        return arm_convolve_1x1_s8_fast(ctx,
                                        conv_params,
                                        quant_params,
                                        input_dims,
                                        input_data,
                                        filter_dims,
                                        filter_data,
                                        bias_dims,
                                        bias_data,
                                        output_dims,
                                        output_data);
    }

    if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
        (input_dims->n == 1) && unit_dilation)
    {
        return arm_convolve_1_x_n_s8(ctx,
                                     conv_params,
                                     quant_params,
                                     input_dims,
                                     input_data,
                                     filter_dims,
                                     filter_data,
                                     bias_dims,
                                     bias_data,
                                     output_dims,
                                     output_data);
    }

    return arm_convolve_s8(ctx,
                           conv_params,
                           quant_params,
                           input_dims,
                           input_data,
                           filter_dims,
                           filter_data,
                           bias_dims,
                           bias_data,
                           output_dims,
                           output_data);
}
/*
 * Scratch buffer size for the kernel that arm_convolve_wrapper_s8 selects.
 *
 * The conditions mirror the wrapper: the 1x1 fast kernel, then the 1xN kernel, then the
 * generic s8 kernel; the matching *_get_buffer_size() result is returned.
 */
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                                const cmsis_nn_dims *input_dims,
                                                const cmsis_nn_dims *filter_dims,
                                                const cmsis_nn_dims *output_dims)
{
    /* Both specialised kernels are only selected for a dilation of 1 in each direction */
    const int unit_dilation = (conv_params->dilation.w == 1 && conv_params->dilation.h == 1);

    if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
        (conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
        (filter_dims->h == 1) && unit_dilation)
    {
        return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims);
    }

    if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
        (input_dims->n == 1) && unit_dilation)
    {
        return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
    }

    return arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
}
/**
* @} end of NNConv group
*/

View File

@ -1,212 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_3x3_s8.c
* Description: Optimized s8 depthwise convolution function for channel
* multiplier of 1 and 3x3 kernel size.
*
* $Date: 09. October 2020
* $Revision: V.2.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that
* in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1
*
* Refer prototype header file for details.
*
*/
/*
 * Optimized s8 depthwise convolution for a 3x3 kernel and a channel multiplier of 1.
 *
 * ctx            : unused.
 * dw_conv_params : padding, stride, input/output offsets and activation range are read.
 * quant_params   : per-channel multiplier and shift arrays.
 * input_dims     : w, h and c are read; the batch dimension is not read, one image is processed.
 * input          : input tensor, indexed as (y * w + x) * c.
 * filter_dims    : w and h must both be 3.
 * kernel         : filter weights, indexed as (ker_y * 3 + ker_x) * c + channel.
 * bias_dims      : unused.
 * bias           : per-channel bias; may be NULL, in which case the accumulators start at 0.
 * output_dims    : w, h and c are read.
 * output         : output tensor, written sequentially.
 *
 * Returns ARM_MATH_SIZE_MISMATCH when input and output channel counts differ,
 * ARM_MATH_ARGUMENT_ERROR when padding.w > 1 or the filter is not 3x3,
 * ARM_MATH_SUCCESS otherwise.
 */
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
                                     const cmsis_nn_dw_conv_params *dw_conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q7_t *input,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *kernel,
                                     const cmsis_nn_dims *bias_dims,
                                     const int32_t *bias,
                                     const cmsis_nn_dims *output_dims,
                                     q7_t *output)
{
    (void)ctx;
    (void)bias_dims;

    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t input_ch = input_dims->c;
    const int32_t output_ch = output_dims->c;
    const int32_t pad_x = dw_conv_params->padding.w;
    const int32_t pad_y = dw_conv_params->padding.h;
    const int32_t stride_x = dw_conv_params->stride.w;
    const int32_t stride_y = dw_conv_params->stride.h;
    const int32_t *output_shift = quant_params->shift;
    const int32_t *output_mult = quant_params->multiplier;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_offset = dw_conv_params->output_offset;
    const int32_t input_offset = dw_conv_params->input_offset;
    const int32_t output_activation_min = dw_conv_params->activation.min;
    const int32_t output_activation_max = dw_conv_params->activation.max;

    /* Check input constraints input_ch == output_ch */
    if (input_ch != output_ch)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }
    /* Check input constraints pad_x <= 1 and a 3x3 filter */
    if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }

    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
    {
        /* ker_h_start skips kernel rows that fall above the input (top padding) */
        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x;
             in_w += stride_x, ++out_w)
        {
            int32_t in_ch = 0;
            /* Non-zero when the left kernel column falls into the left padding */
            int32_t ker_w_start = MAX(0, -in_w);

            /* Four channels at a time */
            for (; in_ch <= (input_ch - 4); in_ch += 4)
            {
                int32_t out_buff0 = 0;
                int32_t out_buff1 = 0;
                int32_t out_buff2 = 0;
                int32_t out_buff3 = 0;
                /* Bias is optional: without it the accumulators start from zero */
                if (bias)
                {
                    out_buff0 = bias[in_ch + 0];
                    out_buff1 = bias[in_ch + 1];
                    out_buff2 = bias[in_ch + 2];
                    out_buff3 = bias[in_ch + 3];
                }

                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

                /* The upper bound skips kernel rows that fall below the input */
                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    int32_t in_val = 0;
                    int32_t ker_val = 0;

                    /* Left kernel column, skipped when it lies in the left padding */
                    if (ker_w_start == 0)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr);
                        ker_val = arm_nn_read_q7x4(kernel_ptr);
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    /* Centre kernel column, always accumulated */
                    in_val = arm_nn_read_q7x4(input_ptr + input_ch);
                    ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);
                    out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                    out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                    out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                    out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);

                    /* Right kernel column, skipped when it lies beyond the right edge */
                    if ((input_x - in_w) >= 3)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
                        ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }

                /* Requantize, add the output offset and clamp to the activation range */
                out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
                out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
                out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
                out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);

                out_buff0 += output_offset;
                out_buff1 += output_offset;
                out_buff2 += output_offset;
                out_buff3 += output_offset;

                out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
                out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
                out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
                out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);

                output[out_idx++] = (int8_t)out_buff0;
                output[out_idx++] = (int8_t)out_buff1;
                output[out_idx++] = (int8_t)out_buff2;
                output[out_idx++] = (int8_t)out_buff3;
            }

            /* Leftover channels, one at a time */
            for (; in_ch < input_ch; ++in_ch)
            {
                /* Bias is optional here as well */
                int32_t out_buff = 0;
                if (bias)
                {
                    out_buff = bias[in_ch];
                }

                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    if (ker_w_start == 0)
                    {
                        out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
                    }

                    out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);

                    if ((input_x - in_w) >= 3)
                    {
                        out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }

                out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
                out_buff += output_offset;
                out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
                output[out_idx++] = (int8_t)out_buff;
            }
        }
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,292 +0,0 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s16.c
* Description: s16 version of depthwise convolution.
*
* $Date: 26. Jan 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * s16 depthwise convolution kernel that produces four output channels per tile.
 *
 * The tile loop steps by 4 and always touches four consecutive channels, so ch_mult is expected
 * to be a multiple of 4. A single image is processed (there is no batch loop) and no dilation is
 * supported. Accumulation is done in 64 bit; results are requantized with the reduced
 * multiplier, clamped to the activation range and written sequentially to output.
 * bias may be NULL, in which case accumulation starts from zero.
 */
static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input,
                                                                  const int32_t input_x,
                                                                  const int32_t input_y,
                                                                  const int32_t input_ch,
                                                                  const int8_t *kernel,
                                                                  const int32_t output_ch,
                                                                  const int32_t ch_mult,
                                                                  const int32_t kernel_x,
                                                                  const int32_t kernel_y,
                                                                  const int32_t pad_x,
                                                                  const int32_t pad_y,
                                                                  const int32_t stride_x,
                                                                  const int32_t stride_y,
                                                                  const int64_t *bias,
                                                                  int16_t *output,
                                                                  const int32_t *output_shift,
                                                                  const int32_t *output_mult,
                                                                  const int32_t output_x,
                                                                  const int32_t output_y,
                                                                  const int32_t output_activation_min,
                                                                  const int32_t output_activation_max)
{
    int32_t out_idx = 0;
    int32_t in_h = -pad_y;

    for (int32_t out_h = 0; out_h < output_y; ++out_h, in_h += stride_y)
    {
        /* Kernel rows that overlap the input for this output row */
        const int32_t ker_h_start = MAX(0, -in_h);
        const int32_t ker_h_end = MIN(kernel_y, input_y - in_h);
        int32_t in_w = -pad_x;

        for (int32_t out_w = 0; out_w < output_x; ++out_w, in_w += stride_x)
        {
            /* Kernel columns that overlap the input for this output column */
            const int32_t ker_w_start = MAX(0, -in_w);
            const int32_t ker_w_end = MIN(kernel_x, input_x - in_w);
            int32_t in_ch = 0;

            for (int32_t out_ch = 0; out_ch < output_ch; out_ch += ch_mult, ++in_ch)
            {
                for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
                {
                    /* First of the four output channels handled by this tile */
                    const int32_t tile_ch = out_ch + mult_tile;
                    int32_t reduced_mult[4];
                    int64_t acc[4] = {0, 0, 0, 0};

                    for (int lane = 0; lane < 4; ++lane)
                    {
                        reduced_mult[lane] = REDUCE_MULTIPLIER(output_mult[tile_ch + lane]);
                    }
                    if (bias)
                    {
                        for (int lane = 0; lane < 4; ++lane)
                        {
                            acc[lane] = bias[tile_ch + lane];
                        }
                    }

                    for (int32_t ker_h = ker_h_start; ker_h < ker_h_end; ++ker_h)
                    {
                        int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
                        const int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
                        for (int32_t ker_w = ker_w_start; ker_w < ker_w_end; ++ker_w, ker_idx += output_ch)
                        {
                            // TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register
                            // spills. Try with unroll of 2 when enabling this.
                            const int32_t in_val = input[in_idx + ker_w * input_ch];
                            const int8_t *weights = &kernel[ker_idx + mult_tile];
                            acc[0] += in_val * weights[0];
                            acc[1] += in_val * weights[1];
                            acc[2] += in_val * weights[2];
                            acc[3] += in_val * weights[3];
                        }
                    }

                    /* Requantize, clamp and store the four results in channel order */
                    for (int lane = 0; lane < 4; ++lane)
                    {
                        int32_t result =
                            arm_nn_requantize_s64(acc[lane], reduced_mult[lane], output_shift[tile_ch + lane]);
                        result = MIN(MAX(result, output_activation_min), output_activation_max);
                        output[out_idx++] = (int16_t)result;
                    }
                }
            }
        }
    }
}
/*
 * Generic s16 depthwise convolution with batch, channel-multiplier and dilation support.
 *
 * For every output element the range of kernel taps that overlap the input is computed first,
 * so the accumulation loops never touch padded positions. Accumulation is done in 64 bit,
 * starting from bias[channel] when bias is not NULL. The sum is requantized with the reduced
 * multiplier, clamped to the activation range and written sequentially to output.
 */
static void depthwise_conv_s16_generic_s16(const int16_t *input,
                                           const uint16_t input_batches,
                                           const uint16_t input_x,
                                           const uint16_t input_y,
                                           const uint16_t input_ch,
                                           const int8_t *kernel,
                                           const uint16_t ch_mult,
                                           const uint16_t kernel_x,
                                           const uint16_t kernel_y,
                                           const uint16_t pad_x,
                                           const uint16_t pad_y,
                                           const uint16_t stride_x,
                                           const uint16_t stride_y,
                                           const int64_t *bias,
                                           int16_t *output,
                                           const int32_t *output_shift,
                                           const int32_t *output_mult,
                                           const uint16_t output_x,
                                           const uint16_t output_y,
                                           const int32_t output_activation_min,
                                           const int32_t output_activation_max,
                                           const uint16_t dilation_x,
                                           const uint16_t dilation_y)
{
    for (int batch = 0; batch < input_batches; batch++)
    {
        for (int out_y = 0; out_y < output_y; out_y++)
        {
            /* Input row of the first kernel tap; negative inside the top padding */
            const int16_t origin_y = (out_y * stride_y) - pad_y;

            for (int out_x = 0; out_x < output_x; out_x++)
            {
                /* Input column of the first kernel tap; negative inside the left padding */
                const int16_t origin_x = (out_x * stride_x) - pad_x;

                for (int in_ch = 0; in_ch < input_ch; in_ch++)
                {
                    for (int mult = 0; mult < ch_mult; mult++)
                    {
                        const int out_ch = mult + in_ch * ch_mult;
                        const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[out_ch]);

                        /* Horizontal tap range; with dilation the distances are divided by the
                         * dilation, rounding up */
                        int32_t x_first = -origin_x;
                        int32_t x_limit = input_x - origin_x;
                        if (dilation_x > 1)
                        {
                            x_first = (x_first + dilation_x - 1) / dilation_x;
                            x_limit = (x_limit + dilation_x - 1) / dilation_x;
                        }
                        const int ker_x_start = MAX(0, x_first);
                        const int ker_x_end = MIN(kernel_x, x_limit);

                        /* Vertical tap range, computed the same way */
                        int32_t y_first = -origin_y;
                        int32_t y_limit = input_y - origin_y;
                        if (dilation_y > 1)
                        {
                            y_first = (y_first + dilation_y - 1) / dilation_y;
                            y_limit = (y_limit + dilation_y - 1) / dilation_y;
                        }
                        const int ker_y_start = MAX(0, y_first);
                        const int ker_y_end = MIN(kernel_y, y_limit);

                        int64_t acc = 0;
                        if (bias)
                        {
                            acc = bias[out_ch];
                        }

                        for (int ker_y = ker_y_start; ker_y < ker_y_end; ker_y++)
                        {
                            const int32_t in_y = origin_y + dilation_y * ker_y;

                            for (int ker_x = ker_x_start; ker_x < ker_x_end; ker_x++)
                            {
                                const int32_t in_x = origin_x + dilation_x * ker_x;
                                const int32_t in_idx = (in_y * input_x + in_x) * input_ch + in_ch;
                                const int32_t weight_idx = (ker_y * kernel_x + ker_x) * (input_ch * ch_mult) + out_ch;

                                acc += input[in_idx] * kernel[weight_idx];
                            }
                        }

                        /* Requantize and clamp output to provided range */
                        int32_t result = arm_nn_requantize_s64(acc, reduced_multiplier, output_shift[out_ch]);
                        result = MAX(result, output_activation_min);
                        result = MIN(result, output_activation_max);
                        *output++ = (int16_t)result;
                    }
                }
            }
        }

        /* Advance to the next batch */
        input += (input_x * input_y * input_ch);
    }
}
/*
* Basic s16 depthwise convolution function.
*
* Refer header file for details.
*
*/
/*
 * Basic s16 depthwise convolution.
 *
 * Unpacks the parameter structures and forwards everything to
 * depthwise_conv_s16_generic_s16(). ctx and bias_dims are unused.
 * Always returns ARM_MATH_SUCCESS.
 */
arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
                                  const cmsis_nn_dw_conv_params *dw_conv_params,
                                  const cmsis_nn_per_channel_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q15_t *input,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *kernel,
                                  const cmsis_nn_dims *bias_dims,
                                  const int64_t *bias,
                                  const cmsis_nn_dims *output_dims,
                                  q15_t *output)
{
    (void)ctx;
    (void)bias_dims;

    depthwise_conv_s16_generic_s16(input,
                                   input_dims->n,
                                   input_dims->w,
                                   input_dims->h,
                                   input_dims->c,
                                   kernel,
                                   dw_conv_params->ch_mult,
                                   filter_dims->w,
                                   filter_dims->h,
                                   dw_conv_params->padding.w,
                                   dw_conv_params->padding.h,
                                   dw_conv_params->stride.w,
                                   dw_conv_params->stride.h,
                                   bias,
                                   output,
                                   quant_params->shift,
                                   quant_params->multiplier,
                                   output_dims->w,
                                   output_dims->h,
                                   dw_conv_params->activation.min,
                                   dw_conv_params->activation.max,
                                   dw_conv_params->dilation.w,
                                   dw_conv_params->dilation.h);

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,347 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8.c
* Description: s8 version of depthwise convolution.
*
* $Date: 30. Dec 2021
* $Revision: V.2.7.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * s8 depthwise convolution kernel that produces four output channels per tile.
 *
 * The tile loop steps by 4 and always touches four consecutive channels, so ch_mult is expected
 * to be a multiple of 4. A single image is processed (there is no batch loop) and no dilation is
 * supported. input_offset is added to every input value before multiplication. Results are
 * requantized, shifted by output_offset, clamped to the activation range and written
 * sequentially to output. bias may be NULL, in which case accumulation starts from zero.
 */
static void depthwise_conv_s8_mult_4(const int8_t *input,
                                     const int32_t input_x,
                                     const int32_t input_y,
                                     const int32_t input_ch,
                                     const int8_t *kernel,
                                     const int32_t output_ch,
                                     const int32_t ch_mult,
                                     const int32_t kernel_x,
                                     const int32_t kernel_y,
                                     const int32_t pad_x,
                                     const int32_t pad_y,
                                     const int32_t stride_x,
                                     const int32_t stride_y,
                                     const int32_t *bias,
                                     int8_t *output,
                                     const int32_t *output_shift,
                                     const int32_t *output_mult,
                                     const int32_t output_x,
                                     const int32_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max)
{
    int32_t out_idx = 0;
    int32_t in_h = -pad_y;

    for (int32_t out_h = 0; out_h < output_y; ++out_h, in_h += stride_y)
    {
        /* Kernel rows that overlap the input for this output row */
        const int32_t ker_h_start = MAX(0, -in_h);
        const int32_t ker_h_end = MIN(kernel_y, input_y - in_h);
        int32_t in_w = -pad_x;

        for (int32_t out_w = 0; out_w < output_x; ++out_w, in_w += stride_x)
        {
            /* Kernel columns that overlap the input for this output column */
            const int32_t ker_w_start = MAX(0, -in_w);
            const int32_t ker_w_end = MIN(kernel_x, input_x - in_w);
            int32_t in_ch = 0;

            for (int32_t out_ch = 0; out_ch < output_ch; out_ch += ch_mult, ++in_ch)
            {
                for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
                {
                    /* First of the four output channels handled by this tile */
                    const int32_t tile_ch = out_ch + mult_tile;
                    int32_t out_buff[4] = {0, 0, 0, 0};

                    if (bias)
                    {
                        for (int lane = 0; lane < 4; ++lane)
                        {
                            out_buff[lane] = bias[tile_ch + lane];
                        }
                    }

                    for (int32_t ker_h = ker_h_start; ker_h < ker_h_end; ++ker_h)
                    {
                        int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
                        const int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
                        for (int32_t ker_w = ker_w_start; ker_w < ker_w_end; ++ker_w, ker_idx += output_ch)
                        {
                            const int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
                            const int8_t *weights = &kernel[ker_idx + mult_tile];
                            out_buff[0] += in_val * weights[0];
                            out_buff[1] += in_val * weights[1];
                            out_buff[2] += in_val * weights[2];
                            out_buff[3] += in_val * weights[3];
                        }
                    }

#if defined(ARM_MATH_MVEI)
                    /* Vector path: the output pointer is advanced directly, out_idx stays unused */
                    (void)out_idx;
                    int32x4_t res = vldrwq_s32(out_buff);
                    res = arm_requantize_mve_32x4(
                        res, vldrwq_s32(&output_mult[tile_ch]), vldrwq_s32(&output_shift[tile_ch]));
                    res = vaddq_n_s32(res, output_offset);
                    res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
                    res = vminq_s32(res, vdupq_n_s32(output_activation_max));
                    vstrbq_s32(output, res);
                    output += 4;
#else
                    /* Scalar path: requantize, offset, clamp and store the four results in channel order */
                    for (int lane = 0; lane < 4; ++lane)
                    {
                        int32_t result =
                            arm_nn_requantize(out_buff[lane], output_mult[tile_ch + lane], output_shift[tile_ch + lane]);
                        result += output_offset;
                        result = MIN(MAX(result, output_activation_min), output_activation_max);
                        output[out_idx++] = (int8_t)result;
                    }
#endif
                }
            }
        }
    }
}
/**
 * @brief Reference s8 depthwise convolution for any channel multiplier, with batching and dilation.
 *
 * Every output value is the sum, over the kernel taps that land inside the input image, of
 * (input + input_offset) * kernel, seeded with the optional bias. The accumulator is then
 * requantized with the multiplier/shift of its output channel, shifted by output_offset and
 * clamped to [output_activation_min, output_activation_max]. Taps that would fall into the
 * padding are skipped rather than multiplied.
 *
 * Indexing used by the code: within one batch the input is addressed as
 * ((y * input_x + x) * input_ch + ch), the kernel as
 * ((ker_y * kernel_x + ker_x) * (input_ch * ch_mult) + out_ch), and the output is written
 * sequentially in batch, y, x, input-channel, channel-multiplier order.
 *
 * @param[in]  input                  Pointer to the input tensor (first batch)
 * @param[in]  input_batches          Number of batches to process
 * @param[in]  input_x                Input width
 * @param[in]  input_y                Input height
 * @param[in]  input_ch               Number of input channels
 * @param[in]  kernel                 Pointer to the kernel weights
 * @param[in]  output_ch              Number of output channels (unused; input_ch * ch_mult is used instead)
 * @param[in]  ch_mult                Channel multiplier
 * @param[in]  kernel_x               Kernel width
 * @param[in]  kernel_y               Kernel height
 * @param[in]  pad_x                  Padding along x
 * @param[in]  pad_y                  Padding along y
 * @param[in]  stride_x               Stride along x
 * @param[in]  stride_y               Stride along y
 * @param[in]  bias                   Optional per-output-channel bias; may be NULL
 * @param[out] output                 Pointer to the output tensor
 * @param[in]  output_shift           Per-output-channel shift passed to arm_nn_requantize()
 * @param[in]  output_mult            Per-output-channel multiplier passed to arm_nn_requantize()
 * @param[in]  output_x               Output width
 * @param[in]  output_y               Output height
 * @param[in]  output_offset          Value added to every requantized result
 * @param[in]  input_offset           Value added to every input sample
 * @param[in]  output_activation_min  Lower clamp bound
 * @param[in]  output_activation_max  Upper clamp bound
 * @param[in]  dilation_x             Dilation along x (values <= 1 use the undilated path)
 * @param[in]  dilation_y             Dilation along y (values <= 1 use the undilated path)
 */
static void depthwise_conv_s8_generic(const q7_t *input,
                                      const uint16_t input_batches,
                                      const uint16_t input_x,
                                      const uint16_t input_y,
                                      const uint16_t input_ch,
                                      const q7_t *kernel,
                                      const uint16_t output_ch,
                                      const uint16_t ch_mult,
                                      const uint16_t kernel_x,
                                      const uint16_t kernel_y,
                                      const uint16_t pad_x,
                                      const uint16_t pad_y,
                                      const uint16_t stride_x,
                                      const uint16_t stride_y,
                                      const int32_t *bias,
                                      q7_t *output,
                                      const int32_t *output_shift,
                                      const int32_t *output_mult,
                                      const uint16_t output_x,
                                      const uint16_t output_y,
                                      const int32_t output_offset,
                                      const int32_t input_offset,
                                      const int32_t output_activation_min,
                                      const int32_t output_activation_max,
                                      const uint16_t dilation_x,
                                      const uint16_t dilation_y)
{
    (void)output_ch;

    /* Running output index; it keeps counting across batches, so only input is advanced per batch. */
    int i_out = 0;

    for (int i_batch = 0; i_batch < input_batches; i_batch++)
    {
        for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            /* Kept 32-bit: output index * stride of 16-bit unsigned operands can exceed the int16_t
               range, and a narrowed (wrapped) base index would yield a wrong kernel window. */
            const int32_t base_idx_y = (i_out_y * stride_y) - pad_y;

            /* The valid kernel row range depends only on the output row, so compute it once here. */
            int ker_y_start;
            int ker_y_end;
            if (dilation_y > 1)
            {
                /* Round up so that the first/last tap used maps to an in-range input row. */
                const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
                ker_y_start = MAX(0, start_y_max);
                const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
                ker_y_end = MIN(kernel_y, end_min_y);
            }
            else
            {
                ker_y_start = MAX(0, -base_idx_y);
                ker_y_end = MIN(kernel_y, input_y - base_idx_y);
            }

            for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;

                /* Likewise, the valid kernel column range depends only on the output column. */
                int ker_x_start;
                int ker_x_end;
                if (dilation_x > 1)
                {
                    const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
                    ker_x_start = MAX(0, start_x_max);
                    const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
                    ker_x_end = MIN(kernel_x, end_min_x);
                }
                else
                {
                    ker_x_start = MAX(0, -base_idx_x);
                    ker_x_end = MIN(kernel_x, input_x - base_idx_x);
                }

                for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
                {
                    for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
                    {
                        const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;

                        /* Seed the accumulator with the bias when one is supplied. */
                        int32_t acc_0 = 0;
                        if (bias)
                        {
                            acc_0 = bias[idx_out_ch];
                        }

                        for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
                        {
                            const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
                            for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
                            {
                                const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
                                int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
                                int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;

                                acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0];
                            }
                        }

                        /* Requantize and clamp output to provided range */
                        acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]);
                        acc_0 += output_offset;
                        acc_0 = MAX(acc_0, output_activation_min);
                        acc_0 = MIN(acc_0, output_activation_max);

                        output[i_out++] = acc_0;
                    }
                }
            }
        }
        /* Advance to the next batch */
        input += (input_x * input_y * input_ch);
    }
}
/*
 * Basic s8 depthwise convolution function.
 *
 * Refer to the header file for the full API description.
 * Optimization using the DSP extension is not available for the generic case where the channel
 * multiplier is > 1.
 *
 * Dispatch performed here:
 *  - channel multiplier divisible by 4, a single batch and dilation 1 in both directions:
 *    depthwise_conv_s8_mult_4()
 *  - anything else: depthwise_conv_s8_generic()
 *
 * ctx and bias_dims are not used. The function always returns ARM_MATH_SUCCESS.
 */
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
                                 const cmsis_nn_dw_conv_params *dw_conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *kernel,
                                 const cmsis_nn_dims *bias_dims,
                                 const int32_t *bias,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output)
{
    (void)ctx;
    (void)bias_dims;

    /* Only the generic kernel takes dilation arguments; they are narrowed to 16 bit for it. */
    const uint16_t dilation_x = dw_conv_params->dilation.w;
    const uint16_t dilation_y = dw_conv_params->dilation.h;

    /* The selection is made on the unnarrowed dilation fields. */
    const int needs_generic = (dw_conv_params->ch_mult % 4 != 0) || (input_dims->n != 1) ||
                              (dw_conv_params->dilation.w != 1) || (dw_conv_params->dilation.h != 1);

    if (needs_generic)
    {
        depthwise_conv_s8_generic(input,
                                  input_dims->n,
                                  input_dims->w,
                                  input_dims->h,
                                  input_dims->c,
                                  kernel,
                                  output_dims->c,
                                  dw_conv_params->ch_mult,
                                  filter_dims->w,
                                  filter_dims->h,
                                  dw_conv_params->padding.w,
                                  dw_conv_params->padding.h,
                                  dw_conv_params->stride.w,
                                  dw_conv_params->stride.h,
                                  bias,
                                  output,
                                  quant_params->shift,
                                  quant_params->multiplier,
                                  output_dims->w,
                                  output_dims->h,
                                  dw_conv_params->output_offset,
                                  dw_conv_params->input_offset,
                                  dw_conv_params->activation.min,
                                  dw_conv_params->activation.max,
                                  dilation_x,
                                  dilation_y);
    }
    else
    {
        depthwise_conv_s8_mult_4(input,
                                 input_dims->w,
                                 input_dims->h,
                                 input_dims->c,
                                 kernel,
                                 output_dims->c,
                                 dw_conv_params->ch_mult,
                                 filter_dims->w,
                                 filter_dims->h,
                                 dw_conv_params->padding.w,
                                 dw_conv_params->padding.h,
                                 dw_conv_params->stride.w,
                                 dw_conv_params->stride.h,
                                 bias,
                                 output,
                                 quant_params->shift,
                                 quant_params->multiplier,
                                 output_dims->w,
                                 output_dims->h,
                                 dw_conv_params->output_offset,
                                 dw_conv_params->input_offset,
                                 dw_conv_params->activation.min,
                                 dw_conv_params->activation.max);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,433 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8_opt.c
* Description: Optimized s8 depthwise separable convolution function for
* channel multiplier of 1.
*
* $Date: January 26, 2021
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * Optimized s8 depthwise convolution function with the constraint that in_channel equals out_channel.
 *
 * Refer to the prototype header file for the full API description.
 *
 * What this definition does:
 *  - returns ARM_MATH_SIZE_MISMATCH when input_dims->c differs from output_dims->c;
 *  - returns ARM_MATH_ARGUMENT_ERROR when ctx->buf is NULL although
 *    arm_depthwise_conv_s8_opt_get_buffer_size() reports a non-zero scratch size;
 *  - with ARM_MATH_DSP and ARM_MATH_MVEI defined: gathers input patches into the scratch buffer as
 *    bytes and processes four output positions per helper call, followed by a vectorized loop for
 *    the positions that are left over;
 *  - with only ARM_MATH_DSP defined: builds one q15 patch per output position in the scratch buffer
 *    and computes four output channels at a time with dual-MAC intrinsics;
 *  - otherwise: forwards all arguments to arm_depthwise_conv_s8().
 *
 * NOTE(review): input_dims->n and dw_conv_params->dilation are not read by the optimized paths in
 * this function; presumably callers only select it for a single batch with dilation 1 - confirm
 * against the header documentation.
 */
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
/* Check input constraints input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
/* A scratch buffer is mandatory whenever the size query reports that one is needed. */
if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
#ifdef ARM_MATH_DSP
/* Unpack the parameter structures into locals shared by the MVE and DSP paths. */
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
/* Caller-provided scratch memory; presumably sized via arm_depthwise_conv_s8_opt_get_buffer_size(). */
q15_t *buffer_a = (q15_t *)ctx->buf;
#ifdef ARM_MATH_MVEI
(void)bias_dims;
/* Generate two columns from the input tensor */
/* In this path the scratch buffer is filled byte-wise, one kernel_size * input_ch patch per output position. */
q7_t *lhs_buffer = (q7_t *)buffer_a;
q7_t *out = output;
/* padded: set once any gathered tap lay outside the input; buffer_count: patches gathered since the last flush. */
int padded = 0;
int buffer_count = 0;
const int32_t kernel_size = kernel_x * kernel_y;
/* This part implements the im2col function */
for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
{
for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
/* Tap outside the input: fill its channels with -input_offset, presumably so that the
   later addition of input_offset yields zero. */
arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch);
padded = 1;
}
else
{
/* Tap inside the input: copy all channels of that pixel. */
arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch);
}
lhs_buffer += input_ch;
}
}
buffer_count++;
/* Four patches gathered: process them with one helper call, then refill from the buffer start. */
if (buffer_count == 4)
{
lhs_buffer = (q7_t *)buffer_a;
if (padded == 0)
{
/* None of the four patches touched the padding. */
out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
}
else
{
/* At least one tap was padded: use the padded helper variant and clear the flag. */
out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
padded = 0;
}
buffer_count = 0;
}
}
}
/* Handle left over buffers */
/* Fewer than four patches remain; each is processed on its own, four channels per vector iteration. */
lhs_buffer = (q7_t *)buffer_a;
for (int i_buf = 0; i_buf < buffer_count; i_buf++)
{
int32_t loop_count = (input_ch + 3) / 4;
int32_t num_ch_to_process = input_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
{
/* col_0 walks the taps of patch i_buf, row_0 the matching kernel weights, both starting at channel 'offset'. */
const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
const int8_t *row_0 = kernel + offset;
/* NOTE(review): bias is dereferenced unconditionally, so this loop assumes bias != NULL - confirm. */
int32x4_t out_0 = vldrwq_s32(&bias[offset]);
for (int i_ker = 0; i_ker < kernel_size; i_ker++)
{
const int32x4_t ker_0 = vldrbq_s32(row_0);
int32x4_t ip_0 = vldrbq_s32(col_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
out_0 += vmulq_s32(ip_0, ker_0);
/* Consecutive taps of one channel are input_ch elements apart in both the patch and the kernel. */
col_0 += input_ch;
row_0 += input_ch;
}
const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, output_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
/* Predicated store: only lanes for channels that still remain are written. */
mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out += 4;
}
/* 'out' was advanced by 4 for the last group even when it held fewer channels; step back over the unused lanes. */
const int tail_ch = input_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
}
#else // ARM_MATH_DSP
(void)bias_dims;
/* Run the following code in cores using DSP extension */
q15_t *const col_buffer_start = buffer_a;
q15_t *col_buffer = col_buffer_start;
/* bias, output_mult and output_shift are advanced per channel below and rewound for every output position. */
const int32_t *const bias_start_pos = bias;
const q31_t *const out_mult_start_pos = output_mult;
const q31_t *const out_shift_start_pos = output_shift;
uint16_t row_count;
uint16_t row_shift;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
/* NOTE(review): the base index is narrowed to int16_t; presumably tensor dimensions keep it in range - confirm. */
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
/* Out of bounds is handled with bulk zeroing only along the y axis, because whole kernel rows are
contiguous in the column buffer; along the x axis it is handled tap by tap below. */
const int ker_y_start = MAX(0, -base_idx_y);
/* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
int32_t index = 0;
/* Kernel rows above the input image: zero that part of the column buffer. */
if (ker_y_start != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
index += (kernel_x * input_ch) * ker_y_start;
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
if (idx_x < 0 || idx_x >= input_x)
{
/* Tap left or right of the image: zero its channels. */
memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
}
else
{
/* Presumably widens the q7 inputs to q15 while adding input_offset - see the helper's documentation. */
arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
&col_buffer[index],
input_ch,
input_offset);
}
index += input_ch;
}
}
/* Kernel rows below the input image: zero the remainder of the column buffer. */
const int diff = kernel_y - ker_y_end;
if (diff != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
}
/* Four output channels are computed per iteration; the remaining (output_ch & 3) follow below. */
row_count = output_ch / 4;
row_shift = 0;
bias = bias_start_pos;
output_mult = out_mult_start_pos;
output_shift = out_shift_start_pos;
while (row_count)
{
/* NOTE(review): bias is dereferenced unconditionally, so this path assumes bias != NULL - confirm. */
q31_t sum = *bias++;
q31_t sum_2 = *bias++;
q31_t sum_3 = *bias++;
q31_t sum_4 = *bias++;
/* Two kernel taps are consumed per iteration of the loop below. */
uint16_t col_count = (kernel_x * kernel_y) / 2;
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
row_shift += 4;
while (col_count)
{
/* General idea is to read 4 + 4 (input, kernel) pair and re-arrange them in the right order to
use in a SMLAD instruction . One run of this loop produces 4 partial outputs with 8 MACs. */
/* Note: variable names can be improved here to align with rows and columns. */
q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;
/* Read 4 weights */
ip_b1 = arm_nn_read_q7x4(row_pos);
ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
op_a = arm_nn_read_q15x2(col_pos);
op_b = arm_nn_read_q15x2(col_pos + input_ch);
ip_a2 = __SXTB16(ip_b1);
ip_b1 = __SXTB16(__ROR(ip_b1, 8));
ip_b2 = __SXTB16(ip_a1);
ip_a1 = __SXTB16(__ROR(ip_a1, 8));
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHBT(ip_b2, ip_a2, 16);
sum = __SMLAD(op_c, op_b, sum);
op_b = __PKHBT(ip_b1, ip_a1, 16);
sum_2 = __SMLAD(op_a, op_b, sum_2);
op_a = arm_nn_read_q15x2(col_pos + 2);
op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHTB(ip_a2, ip_b2, 16);
sum_3 = __SMLAD(op_c, op_b, sum_3);
op_b = __PKHTB(ip_a1, ip_b1, 16);
sum_4 = __SMLAD(op_a, op_b, sum_4);
/* Skip the two taps just consumed (taps of one channel are input_ch elements apart). */
row_pos += input_ch << 1;
col_pos += input_ch << 1;
col_count--;
}
/* Odd number of taps: the last one is handled with scalar multiply-accumulates. */
col_count = (kernel_x * kernel_y) & 0x1;
while (col_count)
{
sum += row_pos[0] * col_pos[0];
sum_2 += row_pos[1] * col_pos[1];
sum_3 += row_pos[2] * col_pos[2];
sum_4 += row_pos[3] * col_pos[3];
row_pos += input_ch;
col_pos += input_ch;
col_count--;
}
/* Requantize, add the output offset, clamp and store each of the four channels. */
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
sum_2 += output_offset;
sum_2 = MAX(sum_2, output_activation_min);
sum_2 = MIN(sum_2, output_activation_max);
*output++ = (q7_t)sum_2;
sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
sum_3 += output_offset;
sum_3 = MAX(sum_3, output_activation_min);
sum_3 = MIN(sum_3, output_activation_max);
*output++ = (q7_t)sum_3;
sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
sum_4 += output_offset;
sum_4 = MAX(sum_4, output_activation_min);
sum_4 = MIN(sum_4, output_activation_max);
*output++ = (q7_t)sum_4;
row_count--;
}
/* Remaining channels (output_ch not a multiple of 4) are computed one at a time. */
row_count = output_ch & 0x3;
while (row_count)
{
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
q31_t sum = *bias++;
const uint16_t col_count = (kernel_x * kernel_y);
row_shift += 1;
for (int i = 0; i < col_count; i++)
{
sum += row_pos[i * input_ch] * col_pos[i * input_ch];
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
row_count--;
}
// clear counter and pointers
col_buffer = col_buffer_start;
}
}
#endif
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
/* Without ARM_MATH_DSP there is no optimized path here; every argument is forwarded unchanged. */
return arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
kernel,
bias_dims,
bias,
output_dims,
output);
#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
 * @brief Get the scratch buffer size, in bytes, that arm_depthwise_conv_s8_opt() checks for.
 *
 * The result depends on the build configuration:
 *  - ARM_MATH_MVEI: 2 * c * w * h 16-bit elements plus 4 extra bytes;
 *  - ARM_MATH_DSP:  c * w * h 16-bit elements;
 *  - otherwise:     0 (no scratch buffer is used).
 *
 * @param[in] input_dims   Input tensor dimensions; only the channel count (c) is read
 * @param[in] filter_dims  Filter dimensions; only width (w) and height (h) are read
 * @return Required buffer size in bytes
 */
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
    /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. */
    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
#elif defined(ARM_MATH_DSP)
    /* sizeof is cast like in the MVEI branch so the whole computation stays in signed 32-bit
       arithmetic instead of being converted to size_t and back. */
    return (input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#else
    (void)input_dims;
    (void)filter_dims;
    return 0;
#endif
}
/**
* @} end of NNConv group
*/

View File

@ -1,336 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_u8_basic_ver1.c
* Description: u8 depthwise convolution function
*
* $Date: 09. October 2020
* $Revision: V.1.1.1
*
* Target : Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief u8 depthwise convolution for channel multipliers handled in tiles of four.
 *
 * For every output position and input channel, the ch_mult output channels belonging to that
 * input channel are produced four at a time: four accumulators collect
 * (input + input_offset) * (kernel + filter_offset) over the kernel taps that lie inside the
 * input, then each accumulator gets the optional bias, is requantized with the single
 * output_mult/output_shift pair, offset, clamped and stored as uint8_t. Each tile reads and
 * writes four consecutive channels, so ch_mult is expected to be a multiple of 4.
 *
 * @param[in]  input                  Pointer to the input tensor
 * @param[in]  input_x                Input width
 * @param[in]  input_y                Input height
 * @param[in]  input_ch               Number of input channels
 * @param[in]  kernel                 Pointer to the kernel weights
 * @param[in]  output_ch              Number of output channels
 * @param[in]  ch_mult                Channel multiplier
 * @param[in]  kernel_x               Kernel width
 * @param[in]  kernel_y               Kernel height
 * @param[in]  pad_x                  Padding along x
 * @param[in]  pad_y                  Padding along y
 * @param[in]  stride_x               Stride along x
 * @param[in]  stride_y               Stride along y
 * @param[in]  bias                   Optional per-output-channel bias; may be NULL
 * @param[out] output                 Pointer to the output tensor, written sequentially
 * @param[in]  output_shift           Shift passed to arm_nn_requantize()
 * @param[in]  output_mult            Multiplier passed to arm_nn_requantize()
 * @param[in]  output_x               Output width
 * @param[in]  output_y               Output height
 * @param[in]  output_offset          Value added to every requantized result
 * @param[in]  input_offset           Value added to every input sample
 * @param[in]  filter_offset          Value added to every kernel weight
 * @param[in]  output_activation_min  Lower clamp bound
 * @param[in]  output_activation_max  Upper clamp bound
 */
static void depthwise_conv_u8_mult_4(const uint8_t *input,
                                     const int32_t input_x,
                                     const int32_t input_y,
                                     const int32_t input_ch,
                                     const uint8_t *kernel,
                                     const int32_t output_ch,
                                     const int32_t ch_mult,
                                     const int32_t kernel_x,
                                     const int32_t kernel_y,
                                     const int32_t pad_x,
                                     const int32_t pad_y,
                                     const int32_t stride_x,
                                     const int32_t stride_y,
                                     const int32_t *bias,
                                     uint8_t *output,
                                     const int32_t output_shift,
                                     const int32_t output_mult,
                                     const int32_t output_x,
                                     const int32_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t filter_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max)
{
    int32_t write_pos = 0;
    int32_t row_origin = -pad_y;

    for (int32_t row = 0; row < output_y; ++row, row_origin += stride_y)
    {
        /* Kernel rows that overlap the input for this output row. */
        const int32_t tap_row_first = MAX(0, -row_origin);
        const int32_t tap_row_limit = MIN(kernel_y, input_y - row_origin);
        int32_t col_origin = -pad_x;

        for (int32_t col = 0; col < output_x; ++col, col_origin += stride_x)
        {
            /* Kernel columns that overlap the input for this output column. */
            const int32_t tap_col_first = MAX(0, -col_origin);
            const int32_t tap_col_limit = MIN(kernel_x, input_x - col_origin);
            int32_t src_ch = 0;

            for (int32_t dst_ch = 0; dst_ch < output_ch; dst_ch += ch_mult, ++src_ch)
            {
                for (int tile = 0; tile < ch_mult; tile += 4)
                {
                    int32_t acc[4] = {0, 0, 0, 0};

                    for (int32_t tap_row = tap_row_first; tap_row < tap_row_limit; ++tap_row)
                    {
                        const int32_t src_base =
                            (row_origin + tap_row) * (input_ch * input_x) + col_origin * input_ch + src_ch;
                        /* Weight index of lane 0 for the first valid tap of this kernel row. */
                        int32_t wt_idx = tap_row * (output_ch * kernel_x) + tap_col_first * output_ch + dst_ch + tile;

                        for (int32_t tap_col = tap_col_first; tap_col < tap_col_limit; ++tap_col)
                        {
                            const int32_t sample = input[src_base + tap_col * input_ch] + input_offset;

                            for (int lane = 0; lane < 4; ++lane)
                            {
                                acc[lane] += sample * (kernel[wt_idx + lane] + filter_offset);
                            }
                            wt_idx += output_ch;
                        }
                    }

                    for (int lane = 0; lane < 4; ++lane)
                    {
                        int32_t value = acc[lane];

                        if (bias != NULL)
                        {
                            value += bias[dst_ch + lane + tile];
                        }
                        value = arm_nn_requantize(value, output_mult, output_shift);
                        value += output_offset;
                        value = MIN(MAX(value, output_activation_min), output_activation_max);
                        output[write_pos++] = (uint8_t)value;
                    }
                }
            }
        }
    }
}
/**
 * @brief Reference u8 depthwise convolution for any channel multiplier.
 *
 * Every output value is the sum, over the kernel taps that land inside the input image, of
 * (input + input_offset) * (kernel + filter_offset), plus the optional bias. The result is
 * requantized with the single output_mult/output_shift pair, shifted by output_offset and
 * clamped to [output_activation_min, output_activation_max]. Taps that would fall into the
 * padding are skipped.
 *
 * Indexing used by the code: input is addressed as ((y * input_x + x) * input_ch + ch), the kernel
 * as ((ker_y * kernel_x + ker_x) * (input_ch * ch_mult) + out_ch), and the output is written
 * sequentially in y, x, input-channel, channel-multiplier order.
 *
 * @param[in]  input                  Pointer to the input tensor
 * @param[in]  input_x                Input width
 * @param[in]  input_y                Input height
 * @param[in]  input_ch               Number of input channels
 * @param[in]  kernel                 Pointer to the kernel weights
 * @param[in]  output_ch              Number of output channels (unused; input_ch * ch_mult is used instead)
 * @param[in]  ch_mult                Channel multiplier
 * @param[in]  kernel_x               Kernel width
 * @param[in]  kernel_y               Kernel height
 * @param[in]  pad_x                  Padding along x
 * @param[in]  pad_y                  Padding along y
 * @param[in]  stride_x               Stride along x
 * @param[in]  stride_y               Stride along y
 * @param[in]  bias                   Optional per-output-channel bias; may be NULL
 * @param[out] output                 Pointer to the output tensor
 * @param[in]  output_shift           Shift passed to arm_nn_requantize()
 * @param[in]  output_mult            Multiplier passed to arm_nn_requantize()
 * @param[in]  output_x               Output width
 * @param[in]  output_y               Output height
 * @param[in]  output_offset          Value added to every requantized result
 * @param[in]  input_offset           Value added to every input sample
 * @param[in]  filter_offset          Value added to every kernel weight
 * @param[in]  output_activation_min  Lower clamp bound
 * @param[in]  output_activation_max  Upper clamp bound
 */
static void depthwise_conv_u8_generic(const uint8_t *input,
                                      const int32_t input_x,
                                      const int32_t input_y,
                                      const int32_t input_ch,
                                      const uint8_t *kernel,
                                      const int32_t output_ch,
                                      const int32_t ch_mult,
                                      const int32_t kernel_x,
                                      const int32_t kernel_y,
                                      const int32_t pad_x,
                                      const int32_t pad_y,
                                      const int32_t stride_x,
                                      const int32_t stride_y,
                                      const int32_t *bias,
                                      uint8_t *output,
                                      const int32_t output_shift,
                                      const int32_t output_mult,
                                      const int32_t output_x,
                                      const int32_t output_y,
                                      const int32_t output_offset,
                                      const int32_t input_offset,
                                      const int32_t filter_offset,
                                      const int32_t output_activation_min,
                                      const int32_t output_activation_max)
{
    (void)output_ch;

    int i_out = 0;

    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
    {
        /* Kept 32-bit: all operands are int32_t, and narrowing the result to 16 bit would wrap for
           base indices above 32767 and yield a wrong kernel window. */
        const int32_t base_idx_y = (i_out_y * stride_y) - pad_y;

        /* The valid kernel row range depends only on the output row, so compute it once here.
           Start: first tap with (base_idx_y + i_ker_y) >= 0; end: taps with (base_idx_y + i_ker_y) < input_y. */
        const int ker_y_start = MAX(0, -base_idx_y);
        const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);

        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
        {
            const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;

            /* Same bounds for the kernel columns, depending only on the output column. */
            const int ker_x_start = MAX(0, -base_idx_x);
            const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);

            for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
            {
                for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
                {
                    const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
                    int32_t acc_0 = 0;

                    for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
                    {
                        const int32_t idx_y = base_idx_y + i_ker_y;
                        for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
                        {
                            const int32_t idx_x = base_idx_x + i_ker_x;
                            int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
                            int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;

                            acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset);
                        }
                    }
                    if (bias != NULL)
                    {
                        acc_0 += bias[idx_out_ch];
                    }

                    /* Requantize and clamp output to provided range */
                    acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift);
                    acc_0 += output_offset;
                    acc_0 = MAX(acc_0, output_activation_min);
                    acc_0 = MIN(acc_0, output_activation_max);

                    output[i_out++] = acc_0;
                }
            }
        }
    }
}
/**
 * @brief uint8 depthwise convolution function with asymmetric quantization
 *
 * Selects depthwise_conv_u8_mult_4() when the channel multiplier is divisible by 4 and
 * depthwise_conv_u8_generic() otherwise. Both receive ch_mult * input_ch as the number of
 * output channels.
 *
 * @param[in]     input                  Pointer to input tensor
 * @param[in]     input_x                Width of input tensor
 * @param[in]     input_y                Height of input tensor
 * @param[in]     input_ch               Channels in input tensor
 * @param[in]     kernel                 Pointer to kernel weights
 * @param[in]     kernel_x               Width of kernel
 * @param[in]     kernel_y               Height of kernel
 * @param[in]     ch_mult                Channel multiplier
 * @param[in]     pad_x                  Padding size along x
 * @param[in]     pad_y                  Padding size along y
 * @param[in]     stride_x               Convolution stride along the width
 * @param[in]     stride_y               Convolution stride along the height
 * @param[in]     dilation_x             Dilation along width. Not used; intended for future enhancement.
 * @param[in]     dilation_y             Dilation along height. Not used; intended for future enhancement.
 * @param[in]     bias                   Pointer to optional bias values. If no bias is
 *                                       available, NULL is expected
 * @param[in]     input_offset           Input tensor zero offset
 * @param[in]     filter_offset          Kernel tensor zero offset
 * @param[in]     output_offset          Output tensor zero offset
 * @param[in,out] output                 Pointer to output tensor
 * @param[in]     output_x               Width of output tensor
 * @param[in]     output_y               Height of output tensor
 * @param[in]     output_activation_min  Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max  Maximum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_shift           Shift used for output requantization
 * @param[in]     output_mult            Output multiplier for requantization
 * @return        <code>ARM_MATH_SUCCESS</code>. This implementation performs no argument
 *                checking and has no other return value.
 */
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
                                            const uint16_t input_x,
                                            const uint16_t input_y,
                                            const uint16_t input_ch,
                                            const uint8_t *kernel,
                                            const uint16_t kernel_x,
                                            const uint16_t kernel_y,
                                            const int16_t ch_mult,
                                            const int16_t pad_x,
                                            const int16_t pad_y,
                                            const int16_t stride_x,
                                            const int16_t stride_y,
                                            const int16_t dilation_x,
                                            const int16_t dilation_y,
                                            const int32_t *bias,
                                            const int32_t input_offset,
                                            const int32_t filter_offset,
                                            const int32_t output_offset,
                                            uint8_t *output,
                                            const uint16_t output_x,
                                            const uint16_t output_y,
                                            const int32_t output_activation_min,
                                            const int32_t output_activation_max,
                                            const int32_t output_shift,
                                            const int32_t output_mult)
{
    (void)dilation_x;
    (void)dilation_y;

    /* Both kernels are given the total number of output channels. */
    const int32_t output_ch = ch_mult * input_ch;

    if (ch_mult % 4 != 0)
    {
        depthwise_conv_u8_generic(input,
                                  input_x,
                                  input_y,
                                  input_ch,
                                  kernel,
                                  output_ch,
                                  ch_mult,
                                  kernel_x,
                                  kernel_y,
                                  pad_x,
                                  pad_y,
                                  stride_x,
                                  stride_y,
                                  bias,
                                  output,
                                  output_shift,
                                  output_mult,
                                  output_x,
                                  output_y,
                                  output_offset,
                                  input_offset,
                                  filter_offset,
                                  output_activation_min,
                                  output_activation_max);
    }
    else
    {
        depthwise_conv_u8_mult_4(input,
                                 input_x,
                                 input_y,
                                 input_ch,
                                 kernel,
                                 output_ch,
                                 ch_mult,
                                 kernel_x,
                                 kernel_y,
                                 pad_x,
                                 pad_y,
                                 stride_x,
                                 stride_y,
                                 bias,
                                 output,
                                 output_shift,
                                 output_mult,
                                 output_x,
                                 output_y,
                                 output_offset,
                                 input_offset,
                                 filter_offset,
                                 output_activation_min,
                                 output_activation_max);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,135 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_wrapper_s8.c
* Description: Wrapper API to select appropriate depthwise conv API based
* on dimensions.
*
* $Date: 20. Dec 2021
* $Revision: V.1.4.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
 * s8 Depthwise conv wrapper function
 *
 * Refer to the header file for the full API description.
 *
 * Selection performed here:
 *  - channel multiplier != 1, more than one batch, or dilation != 1: arm_depthwise_conv_s8()
 *  - otherwise, when ARM_MATH_MVEI is not defined and the filter is 3x3 with padding <= 1 in
 *    both directions: arm_depthwise_conv_3x3_s8()
 *  - otherwise: arm_depthwise_conv_s8_opt()
 *
 * The status of the selected function is returned unchanged.
 */
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
                                         const cmsis_nn_dw_conv_params *dw_conv_params,
                                         const cmsis_nn_per_channel_quant_params *quant_params,
                                         const cmsis_nn_dims *input_dims,
                                         const q7_t *input,
                                         const cmsis_nn_dims *filter_dims,
                                         const q7_t *filter,
                                         const cmsis_nn_dims *bias_dims,
                                         const int32_t *bias,
                                         const cmsis_nn_dims *output_dims,
                                         q7_t *output)
{
    const int needs_basic_kernel = (1 != dw_conv_params->ch_mult) || (input_dims->n != 1) ||
                                   (dw_conv_params->dilation.w != 1) || (dw_conv_params->dilation.h != 1);

    if (needs_basic_kernel)
    {
        return arm_depthwise_conv_s8(ctx,
                                     dw_conv_params,
                                     quant_params,
                                     input_dims,
                                     input,
                                     filter_dims,
                                     filter,
                                     bias_dims,
                                     bias,
                                     output_dims,
                                     output);
    }

#if !defined(ARM_MATH_MVEI)
    /* The dedicated 3x3 kernel is only considered in builds without MVE. */
    if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) &&
        (dw_conv_params->padding.w <= 1))
    {
        return arm_depthwise_conv_3x3_s8(ctx,
                                         dw_conv_params,
                                         quant_params,
                                         input_dims,
                                         input,
                                         filter_dims,
                                         filter,
                                         bias_dims,
                                         bias,
                                         output_dims,
                                         output);
    }
#endif

    /* Return to application */
    return arm_depthwise_conv_s8_opt(ctx,
                                     dw_conv_params,
                                     quant_params,
                                     input_dims,
                                     input,
                                     filter_dims,
                                     filter,
                                     bias_dims,
                                     bias,
                                     output_dims,
                                     output);
}
/*
 * Scratch buffer size for arm_depthwise_conv_wrapper_s8().
 *
 * Returns the size reported by arm_depthwise_conv_s8_opt_get_buffer_size() when input and output
 * channel counts match, there is a single batch and dilation is 1 in both directions.
 * In every other case 0 is returned.
 */
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                      const cmsis_nn_dims *input_dims,
                                                      const cmsis_nn_dims *filter_dims,
                                                      const cmsis_nn_dims *output_dims)
{
    if (input_dims->c != output_dims->c || input_dims->n != 1 || dw_conv_params->dilation.w != 1 ||
        dw_conv_params->dilation.h != 1)
    {
        return 0;
    }

    return arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims);
}
/**
* @} end of NNConv group
*/

View File

@ -1,422 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7.c
* Description: Q7 depthwise separable convolution function
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Q7 depthwise separable convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output (unused)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in equals ch_im_out
 *
 * Implementation (DSP build):
 * There are 3 nested loops here:
 * Inner loop: calculate each output value with MAC instruction over an accumulator
 * Mid loop: loop over different output channel
 * Outer loop: loop over different output (x, y)
 *
 * For every output position the kernel window is first copied into bufferA
 * (im2col, zero-filled where the window falls outside the input), then four
 * channels at a time are accumulated, two kernel positions per iteration.
 *
 * Builds without ARM_MATH_DSP (or with ARM_MATH_MVEI) use a plain reference
 * loop that does not touch bufferA.
 */
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
                                               const uint16_t dim_im_in,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel,
                                               const uint16_t padding,
                                               const uint16_t stride,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out,
                                               q15_t *bufferA,
                                               q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    int16_t i_out_y, i_out_x;
    int16_t i_ker_y, i_ker_x;
    q7_t *colBuffer = (q7_t *)bufferA;
    q7_t *pBuffer = colBuffer;
    const q7_t *pBias = bias;
    q7_t *pOut = Im_out;
    uint16_t rowCnt;
    uint16_t row_shift;

    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            /* we first do im2col here */
            for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
                    {
                        /* window position lies in the padding area: contribute zeros */
                        memset(pBuffer, 0, ch_im_in);
                    }
                    else
                    {
                        /* copy all channels of one input pixel */
                        memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* we will do the computation here for each channel */
            rowCnt = ch_im_out >> 2;
            row_shift = 0;
            pBias = bias;

            /* four channels per iteration */
            while (rowCnt)
            {
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

                /* two kernel positions are consumed per MAC-loop iteration */
                uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                row_shift += 4;

#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;

                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHTB(opB, inB1, 16);
                    inB1 = __PKHBT(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHTB(opB, inA1, 16);
                    inA1 = __PKHBT(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum3 = __SMLAD(opA, opB, sum3);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum4 = __SMLAD(opA, opB, sum4);
                    colCnt--;
                }
#else
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;

                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHBT(opB, inB1, 16);
                    inB1 = __PKHTB(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHBT(opB, inA1, 16);
                    inA1 = __PKHTB(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum4 = __SMLAD(opA, opB, sum4);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum3 = __SMLAD(opA, opB, sum3);
                    colCnt--;
                }
#endif /* ARM_MATH_BIG_ENDIAN */
#else
                /*
                 * The assembly loop tests its counter only after the body has
                 * run, so it must not be entered with a zero count (this
                 * happens for dim_kernel == 1): the counter would wrap and the
                 * loop would walk far beyond both buffers.
                 */
                if (colCnt != 0)
                {
                    /*
                     * The counter is decremented by the asm, so it is passed
                     * as a read-write operand. Both it and the stride are
                     * widened to full registers first.
                     */
                    uint32_t loopCnt = colCnt;
                    const uint32_t chStep = ch_im_in;

#ifndef ARM_MATH_BIG_ENDIAN
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     *
                     * "cc": subs updates the flags.
                     * "memory": the loop reads the im2col buffer written above.
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhtb r3, r5, r2, ASR #16\n"
                                 "pkhbt r2, r2, r5, LSL #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhtb r1, r5, r0, ASR #16\n"
                                 "pkhbt r0, r0, r5, LSL #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(loopCnt)
                                 : [ ch_im_in ] "r"(chStep)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
#else
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     *
                     * "cc": subs updates the flags.
                     * "memory": the loop reads the im2col buffer written above.
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhbt r3, r5, r2, LSL #16\n"
                                 "pkhtb r2, r2, r5, ASR #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhbt r1, r5, r0, LSL #16\n"
                                 "pkhtb r0, r0, r5, ASR #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(loopCnt)
                                 : [ ch_im_in ] "r"(chStep)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
#endif /* ARM_MATH_BIG_ENDIAN */
                }
#endif /* USE_INTRINSIC */

                /* odd number of kernel positions: one position is left over */
                colCnt = (dim_kernel * dim_kernel) & 0x1;
                while (colCnt)
                {
                    union arm_nnword inA, inB;

                    inA.word = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inB.word = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    sum += inA.bytes[0] * inB.bytes[0];
                    sum2 += inA.bytes[1] * inB.bytes[1];
                    sum3 += inA.bytes[2] * inB.bytes[2];
                    sum4 += inA.bytes[3] * inB.bytes[3];
                    colCnt--;
                }

                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
                rowCnt--;
            }

            /* remaining (ch_im_out % 4) channels, one at a time */
            rowCnt = ch_im_out & 0x3;
            while (rowCnt)
            {
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                uint16_t colCnt = (dim_kernel * dim_kernel);

                row_shift += 1;
                while (colCnt)
                {
                    q7_t A1 = *pA;
                    q7_t B1 = *pB;

                    pA += ch_im_in;
                    pB += ch_im_in;
                    sum += A1 * B1;
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                rowCnt--;
            }

            /* clear counter and pointers */
            pBuffer = colBuffer;
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
    int conv_out;

    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
        {
            for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
            {
                /* for each output */
                conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
                for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
                {
                    for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
                    {
                        int in_row = stride * i_out_y + i_ker_y - padding;
                        int in_col = stride * i_out_x + i_ker_x - padding;

                        /* positions in the padding area contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
                        {
                            conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] *
                                wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
                        }
                    }
                }
                Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] =
                    (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,427 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c
* Description: Q7 depthwise separable convolution function (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Q7 depthwise separable convolution function (non-square shape)
 * @param[in]       Im_in         pointer to input tensor
 * @param[in]       dim_im_in_x   input tensor dimension x
 * @param[in]       dim_im_in_y   input tensor dimension y
 * @param[in]       ch_im_in      number of input tensor channels
 * @param[in]       wt            pointer to kernel weights
 * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x  filter kernel size x
 * @param[in]       dim_kernel_y  filter kernel size y
 * @param[in]       padding_x     padding sizes x
 * @param[in]       padding_y     padding sizes y
 * @param[in]       stride_x      convolution stride x
 * @param[in]       stride_y      convolution stride y
 * @param[in]       bias          pointer to bias
 * @param[in]       bias_shift    amount of left-shift for bias
 * @param[in]       out_shift     amount of right-shift for output
 * @param[in,out]   Im_out        pointer to output tensor
 * @param[in]       dim_im_out_x  output tensor dimension x
 * @param[in]       dim_im_out_y  output tensor dimension y
 * @param[in,out]   bufferA       pointer to buffer space for input
 * @param[in,out]   bufferB       pointer to buffer space for output (unused)
 * @return     The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is equal to ch_im_out
 *
 * In the DSP build the kernel window of every output position is first copied
 * into bufferA (im2col, zero-filled where the window falls outside the input);
 * builds without ARM_MATH_DSP (or with ARM_MATH_MVEI) use a plain reference
 * loop that does not touch bufferA.
 */
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
                                                         const uint16_t dim_im_in_x,
                                                         const uint16_t dim_im_in_y,
                                                         const uint16_t ch_im_in,
                                                         const q7_t *wt,
                                                         const uint16_t ch_im_out,
                                                         const uint16_t dim_kernel_x,
                                                         const uint16_t dim_kernel_y,
                                                         const uint16_t padding_x,
                                                         const uint16_t padding_y,
                                                         const uint16_t stride_x,
                                                         const uint16_t stride_y,
                                                         const q7_t *bias,
                                                         const uint16_t bias_shift,
                                                         const uint16_t out_shift,
                                                         q7_t *Im_out,
                                                         const uint16_t dim_im_out_x,
                                                         const uint16_t dim_im_out_y,
                                                         q15_t *bufferA,
                                                         q7_t *bufferB)
{
    (void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
    /* Run the following code for Cortex-M4 and Cortex-M7 */
    /*
     * Implementation:
     * There are 3 nested loops here:
     * Inner loop: calculate each output value with MAC instruction over an accumulator
     * Mid loop: loop over different output channel
     * Outer loop: loop over different output (x, y)
     *
     */
    int16_t i_out_y, i_out_x;
    int16_t i_ker_y, i_ker_x;
    q7_t *colBuffer = (q7_t *)bufferA;
    q7_t *pBuffer = colBuffer;
    const q7_t *pBias = bias;
    q7_t *pOut = Im_out;
    uint16_t rowCnt;
    uint16_t row_shift;

    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            /* we first do im2col here */
            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
                 i_ker_y++)
            {
                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
                     i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
                    {
                        /* window position lies in the padding area: contribute zeros */
                        memset(pBuffer, 0, ch_im_in);
                    }
                    else
                    {
                        /* copy all channels of one input pixel */
                        memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in);
                    }
                    pBuffer += ch_im_in;
                }
            }

            /* we will do the computation here for each channel */
            rowCnt = ch_im_out >> 2;
            row_shift = 0;
            pBias = bias;

            /* four channels per iteration */
            while (rowCnt)
            {
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);

                /* two kernel positions are consumed per MAC-loop iteration */
                uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1;
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                row_shift += 4;

#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;

                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHTB(opB, inB1, 16);
                    inB1 = __PKHBT(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHTB(opB, inA1, 16);
                    inA1 = __PKHBT(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum3 = __SMLAD(opA, opB, sum3);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum4 = __SMLAD(opA, opB, sum4);
                    colCnt--;
                }
#else
                while (colCnt)
                {
                    q31_t inA1, inA2, inB1, inB2, opA, opB;

                    inB1 = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    opB = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    inB2 = __PKHBT(opB, inB1, 16);
                    inB1 = __PKHTB(inB1, opB, 16);
                    inA1 = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    opB = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inA2 = __PKHBT(opB, inA1, 16);
                    inA1 = __PKHTB(inA1, opB, 16);
                    opA = __SXTB16(inA1);
                    opB = __SXTB16(inB1);
                    sum2 = __SMLAD(opA, opB, sum2);
                    opA = __SXTB16(__ROR(inA1, 8));
                    opB = __SXTB16(__ROR(inB1, 8));
                    sum = __SMLAD(opA, opB, sum);
                    opA = __SXTB16(inA2);
                    opB = __SXTB16(inB2);
                    sum4 = __SMLAD(opA, opB, sum4);
                    opA = __SXTB16(__ROR(inA2, 8));
                    opB = __SXTB16(__ROR(inB2, 8));
                    sum3 = __SMLAD(opA, opB, sum3);
                    colCnt--;
                }
#endif /* ARM_MATH_BIG_ENDIAN */
#else
                /*
                 * The assembly loop tests its counter only after the body has
                 * run, so it must not be entered with a zero count (this
                 * happens for a 1x1 kernel): the counter would wrap and the
                 * loop would walk far beyond both buffers.
                 */
                if (colCnt != 0)
                {
                    /*
                     * The counter is decremented by the asm, so it is passed
                     * as a read-write operand. Both it and the stride are
                     * widened to full registers first.
                     */
                    uint32_t loopCnt = colCnt;
                    const uint32_t chStep = ch_im_in;

#ifndef ARM_MATH_BIG_ENDIAN
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     *
                     * The label carries %= so every emitted copy of this asm
                     * statement gets its own symbol.
                     * "cc": subs updates the flags.
                     * "memory": the loop reads the im2col buffer written above.
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhtb r3, r5, r2, ASR #16\n"
                                 "pkhbt r2, r2, r5, LSL #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhtb r1, r5, r0, ASR #16\n"
                                 "pkhbt r0, r0, r5, LSL #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(loopCnt)
                                 : [ ch_im_in ] "r"(chStep)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
#else
                    /*
                     *   r0    r1    r2    r3    r4   r5
                     *  inA1, inA2, inB1, inB2, opA, opB
                     *
                     * The label carries %= so every emitted copy of this asm
                     * statement gets its own symbol.
                     * "cc": subs updates the flags.
                     * "memory": the loop reads the im2col buffer written above.
                     */
                    asm volatile("COL_LOOP_%=:\n"
                                 "ldr.w r2, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pB], #0]\n"
                                 "add.w %[pB], %[pB], %[ch_im_in]\n"
                                 "pkhbt r3, r5, r2, LSL #16\n"
                                 "pkhtb r2, r2, r5, ASR #16\n"
                                 "ldr.w r0, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "ldr.w r5, [%[pA], #0]\n"
                                 "add.w %[pA], %[pA], %[ch_im_in]\n"
                                 "pkhbt r1, r5, r0, LSL #16\n"
                                 "pkhtb r0, r0, r5, ASR #16\n"
                                 "sxtb16 r4, r0\n"
                                 "sxtb16 r5, r2\n"
                                 "smlad %[sum2], r4, r5, %[sum2]\n"
                                 "mov.w r4, r0, ror #8\n"
                                 "mov.w r5, r2, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum], r4, r5, %[sum]\n"
                                 "sxtb16 r4, r1\n"
                                 "sxtb16 r5, r3\n"
                                 "smlad %[sum4], r4, r5, %[sum4]\n"
                                 "mov.w r4, r1, ror #8\n"
                                 "mov.w r5, r3, ror #8\n"
                                 "sxtb16 r4, r4\n"
                                 "sxtb16 r5, r5\n"
                                 "smlad %[sum3], r4, r5, %[sum3]\n"
                                 "subs %[colCnt], #1\n"
                                 "bne COL_LOOP_%=\n"
                                 : [ sum ] "+r"(sum),
                                   [ sum2 ] "+r"(sum2),
                                   [ sum3 ] "+r"(sum3),
                                   [ sum4 ] "+r"(sum4),
                                   [ pB ] "+r"(pB),
                                   [ pA ] "+r"(pA),
                                   [ colCnt ] "+r"(loopCnt)
                                 : [ ch_im_in ] "r"(chStep)
                                 : "r0", "r1", "r2", "r3", "r4", "r5", "cc", "memory");
#endif /* ARM_MATH_BIG_ENDIAN */
                }
#endif /* USE_INTRINSIC */

                /* odd number of kernel positions: one position is left over */
                colCnt = (dim_kernel_x * dim_kernel_y) & 0x1;
                while (colCnt)
                {
                    union arm_nnword inA, inB;

                    inA.word = arm_nn_read_q7x4(pA);
                    pA += ch_im_in;
                    inB.word = arm_nn_read_q7x4(pB);
                    pB += ch_im_in;
                    sum += inA.bytes[0] * inB.bytes[0];
                    sum2 += inA.bytes[1] * inB.bytes[1];
                    sum3 += inA.bytes[2] * inB.bytes[2];
                    sum4 += inA.bytes[3] * inB.bytes[3];
                    colCnt--;
                }

                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
                *pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
                rowCnt--;
            }

            /* remaining (ch_im_out % 4) channels, one at a time */
            rowCnt = ch_im_out & 0x3;
            while (rowCnt)
            {
                q7_t *pB = colBuffer + row_shift;
                const q7_t *pA = wt + row_shift;
                q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
                uint16_t colCnt = (dim_kernel_x * dim_kernel_y);

                row_shift += 1;
                while (colCnt)
                {
                    q7_t A1 = *pA;
                    q7_t B1 = *pB;

                    pA += ch_im_in;
                    pB += ch_im_in;
                    sum += A1 * B1;
                    colCnt--;
                }
                *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
                rowCnt--;
            }

            /* clear counter and pointers */
            pBuffer = colBuffer;
        }
    }
#else
    (void)bufferA;
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    int i_out_y, i_out_x, i_ch_out;
    int i_ker_y, i_ker_x;

    /* do some checking here, basically ch_im_in == ch_im_out */
    if (ch_im_in != ch_im_out)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
    {
        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
        {
            for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
            {
                /* for each output */
                int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);

                for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++)
                {
                    for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++)
                    {
                        int in_row = stride_y * i_out_y + i_ker_y - padding_y;
                        int in_col = stride_x * i_out_x + i_ker_x - padding_x;

                        /* positions in the padding area contribute nothing */
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
                        {
                            conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] *
                                wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out];
                        }
                    }
                }
                Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] =
                    (q7_t)__SSAT((conv_out >> out_shift), 8);
            }
        }
    }
#endif /* ARM_MATH_DSP */
    /* Return to application */
    return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@ -1,218 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_s8_core.c
* Description: Depthwise convolution on im2col buffers.
*
* $Date: 09. October 2020
* $Revision: V.1.0.4
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/*
 * Depthwise conv on an im2col buffer where the input channel equals
 * output channel.
 *
 * Refer header file for details.
 *
 * MVE build: two im2col columns are processed per call. The second column
 * starts kernel_size * num_ch elements after the first one in 'col', and its
 * results are stored num_ch bytes after the first column's results in 'out'.
 * The returned pointer is num_ch bytes past the end of the first column's
 * results. Other builds ignore all arguments and return NULL.
 */
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
                                    const q15_t *col,
                                    const uint16_t num_ch,
                                    const int32_t *out_shift,
                                    const int32_t *out_mult,
                                    const int32_t out_offset,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const uint16_t kernel_size,
                                    const int32_t *const output_bias,
                                    q7_t *out)
{
#if defined(ARM_MATH_MVEI)
    int32_t ch_per_loop = num_ch / 4;
    const int32_t *bias = output_bias;
    int8_t *out_tmp = out;
    int32_t idx = 0;

    /* Four channels per iteration */
    while (ch_per_loop > 0)
    {
        int32x4_t ip_0;
        int32x4_t ip_1;
        int32_t ker_loop = kernel_size / 3;
        int32x4_t out_0 = vldrwq_s32(bias);
        int32x4_t out_1 = out_0;
        bias += 4;

        const int32_t offset = idx * 4;
        const int8_t *row_0 = row + offset;
        const int16_t *col_0 = col + offset;
        const int16_t *col_1 = col + kernel_size * num_ch + offset;

        /*
         * Three kernel elements per iteration. Every kernel vector is loaded
         * at the top of the iteration that uses it, so nothing is fetched
         * beyond the last kernel element.
         */
        while (ker_loop > 0)
        {
            const int32x4_t ker_0 = vldrbq_s32(row_0);
            const int32x4_t ker_1 = vldrbq_s32(row_0 + num_ch);
            const int32x4_t ker_2 = vldrbq_s32(row_0 + 2 * num_ch);

            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_0);
            out_1 += vmulq_s32(ip_1, ker_0);

            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_1);
            out_1 += vmulq_s32(ip_1, ker_1);

            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            col_0 += num_ch;
            col_1 += num_ch;
            out_0 += vmulq_s32(ip_0, ker_2);
            out_1 += vmulq_s32(ip_1, ker_2);

            row_0 += 3 * num_ch;
            ker_loop--;
        }
        idx++;

        /* Handle tail kernel elements */
        ker_loop = kernel_size - ((kernel_size / 3) * 3);
        while (ker_loop > 0)
        {
            const int32x4_t ker_0 = vldrbq_s32(row_0);

            ip_0 = vldrhq_s32(col_0);
            ip_1 = vldrhq_s32(col_1);
            out_0 += vmulq_s32(ip_0, ker_0);
            out_1 += vmulq_s32(ip_1, ker_0);

            col_0 += num_ch;
            col_1 += num_ch;
            row_0 += num_ch;
            ker_loop--;
        }

        /* Per-channel requantization, offset and clamping */
        const int32x4_t mult = vldrwq_s32(out_mult);
        const int32x4_t shift = vldrwq_s32(out_shift);
        out_mult += 4;
        out_shift += 4;

        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);

        out_0 = vaddq_n_s32(out_0, out_offset);
        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        vstrbq_s32(out_tmp, out_0);

        out_1 = vaddq_n_s32(out_1, out_offset);
        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrbq_s32(out_tmp + num_ch, out_1);

        out_tmp += 4;
        ch_per_loop--;
    }

    /* Remaining (num_ch % 4) channels */
    int32_t tail_ch = num_ch & 3;
    if (tail_ch != 0)
    {
        int32_t ch_idx = (num_ch & ~3);
        /* Zeroed so the lanes that are not written below hold defined values */
        int32x4_t col_0_sum = vdupq_n_s32(0);
        int32x4_t col_1_sum = vdupq_n_s32(0);
        const int32_t single_buffer_size = kernel_size * num_ch;

        for (int i = 0; i < tail_ch; i++)
        {
            const int16_t *col_pos_0 = col + ch_idx;
            const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
            const int8_t *row_pos = row + ch_idx;
            /* 'bias' was advanced by the loop above, so index i is the i-th tail channel */
            int32_t sum_0 = bias[i];
            int32_t sum_1 = bias[i];

            for (int j = 0; j < kernel_size; j++)
            {
                const int8_t row_val = row_pos[j * num_ch];
                sum_0 += row_val * col_pos_0[j * num_ch];
                sum_1 += row_val * col_pos_1[j * num_ch];
            }
            col_0_sum[i] = sum_0;
            col_1_sum[i] = sum_1;
            ch_idx++;
        }

        /* Predicate limits loads and stores to the tail_ch valid lanes */
        const mve_pred16_t p = vctp32q((uint32_t)tail_ch);
        const int32x4_t mult = vldrwq_z_s32(out_mult, p);
        const int32x4_t shift = vldrwq_z_s32(out_shift, p);

        col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift);
        col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift);

        col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
        col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
        col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out_tmp, col_0_sum, p);

        col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
        col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
        col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);

        out_tmp += tail_ch;
    }

    /* Skip the second column's results, which were stored num_ch further on */
    return out_tmp + num_ch;
#else
    (void)row;
    (void)col;
    (void)num_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)kernel_size;
    (void)output_bias;
    (void)out;
    return NULL;
#endif
}

View File

@ -1,186 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15.c
* Description: Matrix-multiplication function for convolution
*
* $Date: January 26, 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
 * @brief Matrix-multiplication function for convolution.
 *
 * @details Refer to header file for details.
 *
 * DSP build: rows of A (q7, numCol_A entries each) are multiplied with two
 * consecutive q15 columns held in pInBuffer. Results for the first column go
 * to pOut, results for the second column to pOut + ch_im_out. The returned
 * pointer is pOut advanced by 2 * ch_im_out. Without ARM_MATH_DSP the
 * function is a stub that returns NULL.
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
                                    const q15_t *pInBuffer,
                                    const uint16_t ch_im_out,
                                    const uint16_t numCol_A,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    const q7_t *bias,
                                    q7_t *pOut)
{
#if defined(ARM_MATH_DSP)
    /* output location of the second input column */
    q7_t *pOutCol1 = pOut + ch_im_out;
    const q7_t *pBiasCur = bias;
    uint16_t pairsLeft;
    uint16_t n;

    /* two rows of A per iteration */
    for (pairsLeft = ch_im_out >> 1; pairsLeft != 0; pairsLeft--)
    {
        /* the two input columns */
        const q15_t *pCol0 = pInBuffer;
        const q15_t *pCol1 = pCol0 + numCol_A;
        /* second row of the pair */
        const q7_t *pRow1 = pA + numCol_A;

        /* accumulators start from the shifted bias plus the rounding term */
        const q31_t startRow0 = ((q31_t)(*pBiasCur++) << bias_shift) + NN_ROUND(out_shift);
        const q31_t startRow1 = ((q31_t)(*pBiasCur++) << bias_shift) + NN_ROUND(out_shift);
        q31_t accR0C0 = startRow0;
        q31_t accR0C1 = startRow0;
        q31_t accR1C0 = startRow1;
        q31_t accR1C1 = startRow1;

        /* four columns of A per iteration */
        for (n = numCol_A >> 2; n != 0; n--)
        {
            q31_t r0a, r0b, r1a, r1b;
            q31_t c0 = arm_nn_read_q15x2_ia(&pCol0);
            q31_t c1 = arm_nn_read_q15x2_ia(&pCol1);

            pA = read_and_pad(pA, &r0a, &r0b);
            pRow1 = read_and_pad(pRow1, &r1a, &r1b);

            accR0C0 = __SMLAD(r0a, c0, accR0C0);
            accR0C1 = __SMLAD(r0a, c1, accR0C1);
            accR1C0 = __SMLAD(r1a, c0, accR1C0);
            accR1C1 = __SMLAD(r1a, c1, accR1C1);

            c0 = arm_nn_read_q15x2_ia(&pCol0);
            c1 = arm_nn_read_q15x2_ia(&pCol1);

            accR0C0 = __SMLAD(r0b, c0, accR0C0);
            accR0C1 = __SMLAD(r0b, c1, accR0C1);
            accR1C0 = __SMLAD(r1b, c0, accR1C0);
            accR1C1 = __SMLAD(r1b, c1, accR1C1);
        }

        /* remaining (numCol_A % 4) columns */
        for (n = numCol_A & 0x3; n != 0; n--)
        {
            const q7_t w0 = *pA++;
            const q15_t x0 = *pCol0++;
            const q7_t w1 = *pRow1++;
            const q15_t x1 = *pCol1++;

            accR0C0 += w0 * x0;
            accR0C1 += w0 * x1;
            accR1C0 += w1 * x0;
            accR1C1 += w1 * x1;
        }

        pOut[0] = (q7_t)__SSAT((accR0C0 >> out_shift), 8);
        pOut[1] = (q7_t)__SSAT((accR1C0 >> out_shift), 8);
        pOut += 2;
        pOutCol1[0] = (q7_t)__SSAT((accR0C1 >> out_shift), 8);
        pOutCol1[1] = (q7_t)__SSAT((accR1C1 >> out_shift), 8);
        pOutCol1 += 2;

        /* pA has walked over the first row; step over the second one too */
        pA += numCol_A;
    }

    /* odd number of rows: one row is left */
    if (ch_im_out & 0x1)
    {
        const q15_t *pCol0 = pInBuffer;
        const q15_t *pCol1 = pCol0 + numCol_A;

        const q31_t startRow = ((q31_t)(*pBiasCur++) << bias_shift) + NN_ROUND(out_shift);
        q31_t accC0 = startRow;
        q31_t accC1 = startRow;

        for (n = numCol_A >> 2; n != 0; n--)
        {
            q31_t ra, rb;
            q31_t c0 = arm_nn_read_q15x2_ia(&pCol0);
            q31_t c1 = arm_nn_read_q15x2_ia(&pCol1);

            pA = read_and_pad(pA, &ra, &rb);

            accC0 = __SMLAD(ra, c0, accC0);
            accC1 = __SMLAD(ra, c1, accC1);

            c0 = arm_nn_read_q15x2_ia(&pCol0);
            c1 = arm_nn_read_q15x2_ia(&pCol1);

            accC0 = __SMLAD(rb, c0, accC0);
            accC1 = __SMLAD(rb, c1, accC1);
        }

        for (n = numCol_A & 0x3; n != 0; n--)
        {
            const q7_t w = *pA++;
            const q15_t x0 = *pCol0++;
            const q15_t x1 = *pCol1++;

            accC0 += w * x0;
            accC1 += w * x1;
        }

        *pOut++ = (q7_t)__SSAT((accC0 >> out_shift), 8);
        *pOutCol1++ = (q7_t)__SSAT((accC1 >> out_shift), 8);
    }

    /* skip the block that holds the second column's results */
    return pOut + ch_im_out;
#else
    (void)pA;
    (void)pInBuffer;
    (void)ch_im_out;
    (void)numCol_A;
    (void)bias_shift;
    (void)out_shift;
    (void)bias;
    (void)pOut;
    /* To be completed */
    return NULL;
#endif /* ARM_MATH_DSP */
}

View File

@ -1,137 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: January 26, 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
 * @brief Matrix-multiplication function for convolution with re-ordered input.
 *
 * @details Refer to header file for details.
 *
 * DSP build: rows of A (q7, numCol_A entries each) are multiplied with two
 * consecutive q15 columns held in pInBuffer. Results for the first column go
 * to pOut, results for the second column to pOut + ch_im_out. The returned
 * pointer is pOut advanced by 2 * ch_im_out. Rows are processed in pairs; an
 * odd ch_im_out is handled with a separate pass over the last row, so no
 * bias, weight or output element beyond ch_im_out is touched. Without
 * ARM_MATH_DSP the function is a stub that returns NULL.
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
                                              const q15_t *pInBuffer,
                                              const uint16_t ch_im_out,
                                              const uint16_t numCol_A,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              const q7_t *bias,
                                              q7_t *pOut)
{
#if defined(ARM_MATH_DSP)
    /* set up the second output pointers */
    q7_t *pOut2 = pOut + ch_im_out;
    int i;

    /* this loop over row pairs in A; stops before an unpaired last row */
    for (i = 0; i + 1 < ch_im_out; i += 2)
    {
        /* setup pointers for B */
        const q15_t *pB = pInBuffer;
        const q15_t *pB2 = pB + numCol_A;

        /* align the second pointer for A */
        const q7_t *pA2 = pA + numCol_A;

        /* init the sum with bias */
        q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum3 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum4 = ((q31_t)(bias[i + 1]) << bias_shift) + NN_ROUND(out_shift);

        uint16_t colCnt = numCol_A >> 2;

        /* accumulate over the vector */
        while (colCnt)
        {
            q31_t inA11, inA12, inA21, inA22;
            q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
            q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);

            pA = read_and_pad_reordered(pA, &inA11, &inA12);
            pA2 = read_and_pad_reordered(pA2, &inA21, &inA22);

            sum = __SMLAD(inA11, inB1, sum);
            sum2 = __SMLAD(inA11, inB2, sum2);
            sum3 = __SMLAD(inA21, inB1, sum3);
            sum4 = __SMLAD(inA21, inB2, sum4);

            inB1 = arm_nn_read_q15x2_ia(&pB);
            inB2 = arm_nn_read_q15x2_ia(&pB2);

            sum = __SMLAD(inA12, inB1, sum);
            sum2 = __SMLAD(inA12, inB2, sum2);
            sum3 = __SMLAD(inA22, inB1, sum3);
            sum4 = __SMLAD(inA22, inB2, sum4);

            colCnt--;
        } /* while over colCnt */

        colCnt = numCol_A & 0x3;
        while (colCnt)
        {
            q7_t inA1 = *pA++;
            q15_t inB1 = *pB++;
            q7_t inA2 = *pA2++;
            q15_t inB2 = *pB2++;

            sum += inA1 * inB1;
            sum2 += inA1 * inB2;
            sum3 += inA2 * inB1;
            sum4 += inA2 * inB2;
            colCnt--;
        } /* while over colCnt */

        *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        *pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8);

        /* skip the row computed with A2 */
        pA += numCol_A;
    } /* for over ch_im_out */

    /* compute left-over row if any; here i == ch_im_out - 1 */
    if (ch_im_out & 0x1)
    {
        /* setup pointers for B */
        const q15_t *pB = pInBuffer;
        const q15_t *pB2 = pB + numCol_A;

        /* load the bias */
        q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
        q31_t sum2 = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);

        uint16_t colCnt = numCol_A >> 2;

        while (colCnt)
        {
            q31_t inA11, inA12;
            q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
            q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);

            pA = read_and_pad_reordered(pA, &inA11, &inA12);

            sum = __SMLAD(inA11, inB1, sum);
            sum2 = __SMLAD(inA11, inB2, sum2);

            inB1 = arm_nn_read_q15x2_ia(&pB);
            inB2 = arm_nn_read_q15x2_ia(&pB2);

            sum = __SMLAD(inA12, inB1, sum);
            sum2 = __SMLAD(inA12, inB2, sum2);

            colCnt--;
        }

        colCnt = numCol_A & 0x3;
        while (colCnt)
        {
            q7_t inA1 = *pA++;
            q15_t inB1 = *pB++;
            q15_t inB2 = *pB2++;

            sum += inA1 * inB1;
            sum2 += inA1 * inB2;
            colCnt--;
        }

        *pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
        *pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
    }

    pOut += ch_im_out;

    /* return the new output pointer with offset */
    return pOut;
#else
    (void)pA;
    (void)pInBuffer;
    (void)ch_im_out;
    (void)numCol_A;
    (void)bias_shift;
    (void)out_shift;
    (void)bias;
    (void)pOut;
    /* To be completed */
    return NULL;
#endif /* ARM_MATH_DSP */
}

View File

@ -1,245 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16.c
* Description: Matrix-multiplication function for convolution
*
* $Date: 14. December 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/*
* Matrix-multiplication function for convolution with per-channel requantization.
*
* Refer header file for details.
*
*/
/*
 * Multiply the rows of the s8 matrix input_a with two q15 vectors taken from
 * input_b, requantize every sum per output channel and store the results as s8.
 *
 * input_a         s8 matrix; consecutive rows are num_col_a elements apart.
 * input_b         two q15 vectors stored back to back, each num_col_a long.
 * output_ch       number of rows of input_a, i.e. results produced per vector.
 * out_shift       per-channel shift passed to arm_nn_requantize().
 * out_mult        per-channel multiplier passed to arm_nn_requantize().
 * out_offset      value added to every requantized sum.
 * activation_min  lower clamp bound applied before the store.
 * activation_max  upper clamp bound applied before the store.
 * num_col_a       number of elements per row / per vector.
 * output_bias     per-channel start value of the sums; NULL means start at 0.
 * out_0           destination; results for the first vector are written to
 *                 out_0[0 .. output_ch - 1], results for the second vector to
 *                 the output_ch bytes that follow.
 *
 * Returns out_0 advanced by 2 * output_ch. When ARM_MATH_MVEI is defined no
 * implementation is provided here: all arguments are ignored and NULL is
 * returned.
 */
q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
                                    const q15_t *input_b,
                                    const uint16_t output_ch,
                                    const int32_t *out_shift,
                                    const int32_t *out_mult,
                                    const int32_t out_offset,
                                    const int16_t activation_min,
                                    const int16_t activation_max,
                                    const uint16_t num_col_a,
                                    const int32_t *const output_bias,
                                    q7_t *out_0)
{
#if !defined(ARM_MATH_MVEI)
    const int32_t *bias = output_bias;
    const q7_t *row_a0 = input_a;
    /* Destination for the results that belong to the second vector of input_b. */
    q7_t *out_1 = out_0 + output_ch;

    /* Main part: two rows of input_a (two output channels) per iteration. */
    for (uint16_t pairs_left = output_ch / 2; pairs_left != 0; pairs_left--)
    {
        const q7_t *row_a1 = row_a0 + num_col_a;
        const q15_t *vec_b0 = input_b;
        const q15_t *vec_b1 = vec_b0 + num_col_a;

        /* Naming: acc_<row>_<vector>. Start from the channel bias if one is supplied. */
        q31_t acc_0_0 = 0;
        q31_t acc_0_1 = 0;
        q31_t acc_1_0 = 0;
        q31_t acc_1_1 = 0;
        if (bias)
        {
            acc_0_0 = bias[0];
            acc_0_1 = bias[0];
            acc_1_0 = bias[1];
            acc_1_1 = bias[1];
            bias += 2;
        }

#if defined(ARM_MATH_DSP)
        /* Four columns per iteration, two products per __SMLAD. */
        for (uint16_t quads_left = num_col_a / 4; quads_left != 0; quads_left--)
        {
            q31_t a0_x, a0_y, a1_x, a1_y;
            q31_t b0_pair = arm_nn_read_q15x2_ia(&vec_b0);
            q31_t b1_pair = arm_nn_read_q15x2_ia(&vec_b1);

            row_a0 = read_and_pad(row_a0, &a0_x, &a0_y);
            row_a1 = read_and_pad(row_a1, &a1_x, &a1_y);

            acc_0_0 = __SMLAD(a0_x, b0_pair, acc_0_0);
            acc_0_1 = __SMLAD(a0_x, b1_pair, acc_0_1);
            acc_1_0 = __SMLAD(a1_x, b0_pair, acc_1_0);
            acc_1_1 = __SMLAD(a1_x, b1_pair, acc_1_1);

            b0_pair = arm_nn_read_q15x2_ia(&vec_b0);
            b1_pair = arm_nn_read_q15x2_ia(&vec_b1);

            acc_0_0 = __SMLAD(a0_y, b0_pair, acc_0_0);
            acc_0_1 = __SMLAD(a0_y, b1_pair, acc_0_1);
            acc_1_0 = __SMLAD(a1_y, b0_pair, acc_1_0);
            acc_1_1 = __SMLAD(a1_y, b1_pair, acc_1_1);
        }
        /* Columns that did not fill a group of four are handled one by one below. */
        uint16_t cols_left = num_col_a & 0x3;
#else
        /* Without the DSP extension every column goes through the scalar loop. */
        uint16_t cols_left = num_col_a;
#endif
        for (; cols_left != 0; cols_left--)
        {
            const q7_t a0 = *row_a0++;
            const q15_t b0 = *vec_b0++;
            const q7_t a1 = *row_a1++;
            const q15_t b1 = *vec_b1++;

            acc_0_0 += a0 * b0;
            acc_0_1 += a0 * b1;
            acc_1_0 += a1 * b0;
            acc_1_1 += a1 * b1;
        }

        /* First channel of the pair: requantize, add offset, clamp, store. */
        acc_0_0 = arm_nn_requantize(acc_0_0, out_mult[0], out_shift[0]) + out_offset;
        acc_0_0 = MIN(MAX(acc_0_0, activation_min), activation_max);
        *out_0++ = (q7_t)acc_0_0;

        acc_0_1 = arm_nn_requantize(acc_0_1, out_mult[0], out_shift[0]) + out_offset;
        acc_0_1 = MIN(MAX(acc_0_1, activation_min), activation_max);
        *out_1++ = (q7_t)acc_0_1;

        /* Second channel of the pair uses the next multiplier/shift entry. */
        acc_1_0 = arm_nn_requantize(acc_1_0, out_mult[1], out_shift[1]) + out_offset;
        acc_1_0 = MIN(MAX(acc_1_0, activation_min), activation_max);
        *out_0++ = (q7_t)acc_1_0;

        acc_1_1 = arm_nn_requantize(acc_1_1, out_mult[1], out_shift[1]) + out_offset;
        acc_1_1 = MIN(MAX(acc_1_1, activation_min), activation_max);
        *out_1++ = (q7_t)acc_1_1;

        out_mult += 2;
        out_shift += 2;

        /* row_a0 now sits at the start of the row handled through row_a1; step over it. */
        row_a0 += num_col_a;
    }

    /* One row is left over when the channel count is odd. */
    if (output_ch & 0x1)
    {
        const q15_t *vec_b0 = input_b;
        const q15_t *vec_b1 = vec_b0 + num_col_a;

        q31_t acc_0 = 0;
        q31_t acc_1 = 0;
        if (bias)
        {
            acc_0 = *bias;
            acc_1 = *bias;
        }

#if defined(ARM_MATH_DSP)
        for (uint16_t quads_left = num_col_a >> 2; quads_left != 0; quads_left--)
        {
            q31_t a_x, a_y;
            q31_t b0_pair = arm_nn_read_q15x2_ia(&vec_b0);
            q31_t b1_pair = arm_nn_read_q15x2_ia(&vec_b1);

            row_a0 = read_and_pad(row_a0, &a_x, &a_y);

            acc_0 = __SMLAD(a_x, b0_pair, acc_0);
            acc_1 = __SMLAD(a_x, b1_pair, acc_1);

            b0_pair = arm_nn_read_q15x2_ia(&vec_b0);
            b1_pair = arm_nn_read_q15x2_ia(&vec_b1);

            acc_0 = __SMLAD(a_y, b0_pair, acc_0);
            acc_1 = __SMLAD(a_y, b1_pair, acc_1);
        }
        uint16_t cols_left = num_col_a & 0x3;
#else
        uint16_t cols_left = num_col_a;
#endif
        for (; cols_left != 0; cols_left--)
        {
            const q7_t a0 = *row_a0++;
            const q15_t b0 = *vec_b0++;
            const q15_t b1 = *vec_b1++;

            acc_0 += a0 * b0;
            acc_1 += a0 * b1;
        }

        acc_0 = arm_nn_requantize(acc_0, *out_mult, *out_shift) + out_offset;
        acc_0 = MIN(MAX(acc_0, activation_min), activation_max);
        *out_0++ = (q7_t)acc_0;

        acc_1 = arm_nn_requantize(acc_1, *out_mult, *out_shift) + out_offset;
        acc_1 = MIN(MAX(acc_1, activation_min), activation_max);
        *out_1 = (q7_t)acc_1;
    }

    /* Skip the block written through out_1 so the caller gets the next free position. */
    return out_0 + output_ch;
#else
    (void)input_a;
    (void)input_b;
    (void)output_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)num_col_a;
    (void)output_bias;
    (void)out_0;
    /* To be completed */
    return NULL;
#endif
}

View File

@ -1,201 +0,0 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c
* Description: Matrix-multiplication function for convolution with reordered columns
*
* $Date: 09. October 2020
* $Revision: V.1.0.3
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/*
* Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
* requantization. The re-ordering is a consequence of how sign extension is done by the SXTB16 instruction.
*
* Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16() in that it uses
* read_and_pad_reordered() instead of read_and_pad(). Investigating the cycles impact and
* unifying these two functions is a potential future improvement.
*
*/
/*
 * Multiply the rows of the s8 matrix input_a with two q15 vectors taken from
 * input_b, using read_and_pad_reordered() to widen the s8 operands, then
 * requantize each sum per output channel and store it as s8.
 *
 * input_a         s8 matrix; consecutive rows are num_col_a elements apart.
 * input_b         two q15 vectors stored back to back, each num_col_a long.
 * output_ch       number of rows of input_a, i.e. results produced per vector.
 * out_shift       per-channel shift passed to arm_nn_requantize().
 * out_mult        per-channel multiplier passed to arm_nn_requantize().
 * out_offset      value added to every requantized sum.
 * activation_min  lower clamp bound applied before the store.
 * activation_max  upper clamp bound applied before the store.
 * num_col_a       number of elements per row / per vector. Only
 *                 4 * (num_col_a / 4) columns are accumulated; this function has
 *                 no scalar loop for a remainder, so callers are expected to
 *                 pass a multiple of 4.
 * output_bias     per-channel start value of the sums; NULL means start at 0.
 * out_0           destination; results for the first vector are written to
 *                 out_0[0 .. output_ch - 1], results for the second vector to
 *                 the output_ch bytes that follow.
 *
 * Returns out_0 advanced by 2 * output_ch. When ARM_MATH_DSP is not defined no
 * implementation is provided: all arguments are ignored and NULL is returned.
 */
q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
                                              const q15_t *input_b,
                                              const uint16_t output_ch,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t out_offset,
                                              const int16_t activation_min,
                                              const int16_t activation_max,
                                              const uint16_t num_col_a,
                                              const int32_t *const output_bias,
                                              q7_t *out_0)
{
#if defined(ARM_MATH_DSP)
    /* set up the second output pointer: results for the second vector of input_b */
    q7_t *out_1 = out_0 + output_ch;
    const int32_t *bias = output_bias;

    uint16_t row_count = output_ch / 2;
    const q7_t *ip_a0 = input_a;

    /* this loop runs over pairs of rows in A */
    while (row_count)
    {
        /* setup pointers for B */
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;

        /* align the second pointer for A */
        const q7_t *ip_a1 = ip_a0 + num_col_a;

        q31_t ch_0_out_0 = 0;
        q31_t ch_0_out_1 = 0;
        q31_t ch_1_out_0 = 0;
        q31_t ch_1_out_1 = 0;

        /*
         * Init accumulators with bias for channel N and N + 1. The bias is
         * optional, matching arm_nn_mat_mult_kernel_s8_s16(): a NULL pointer
         * leaves the accumulators at 0 instead of being dereferenced.
         */
        if (bias)
        {
            ch_0_out_0 = *bias;
            ch_0_out_1 = *bias++;
            ch_1_out_0 = *bias;
            ch_1_out_1 = *bias++;
        }

        uint16_t col_count = num_col_a / 4;
        /* accumulate over the vector, four columns per iteration */
        while (col_count)
        {
            q31_t a01, a02, a11, a12;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);

            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
            ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);

            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
            ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
            ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);

            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);

            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
            ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
            ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);

            col_count--;
        } /* while over col_count */

        /* channel N: requantize, add offset, clamp to the activation range, store */
        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
        ch_0_out_0 += out_offset;
        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
        *out_0++ = (q7_t)ch_0_out_0;

        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
        ch_0_out_1 += out_offset;
        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
        *out_1++ = (q7_t)ch_0_out_1;
        out_mult++;
        out_shift++;

        /* channel N + 1 uses the next multiplier/shift entry */
        ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
        ch_1_out_0 += out_offset;
        ch_1_out_0 = MAX(ch_1_out_0, activation_min);
        ch_1_out_0 = MIN(ch_1_out_0, activation_max);
        *out_0++ = (q7_t)ch_1_out_0;

        ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
        ch_1_out_1 += out_offset;
        ch_1_out_1 = MAX(ch_1_out_1, activation_min);
        ch_1_out_1 = MIN(ch_1_out_1, activation_max);
        *out_1++ = (q7_t)ch_1_out_1;
        out_mult++;
        out_shift++;

        /* skip the row that was consumed through ip_a1 */
        ip_a0 += num_col_a;
        row_count--;
    }

    /* compute the last row when the channel count is odd */
    if (output_ch & 1)
    {
        /* setup pointers for B */
        const q15_t *ip_b0 = input_b;
        const q15_t *ip_b1 = ip_b0 + num_col_a;

        /* Init accumulators with the bias of the last channel, or 0 without bias */
        q31_t ch_0_out_0 = bias ? *bias : 0;
        q31_t ch_0_out_1 = ch_0_out_0;

        uint16_t col_count = num_col_a / 4;
        while (col_count)
        {
            q31_t a01, a02;
            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);

            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);

            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);

            b0 = arm_nn_read_q15x2_ia(&ip_b0);
            b1 = arm_nn_read_q15x2_ia(&ip_b1);

            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);

            col_count--;
        } /* while over col_count */

        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
        ch_0_out_0 += out_offset;
        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
        *out_0++ = (q7_t)ch_0_out_0;

        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
        ch_0_out_1 += out_offset;
        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
        *out_1++ = (q7_t)ch_0_out_1;
    }

    /* step over the block written through out_1 */
    out_0 += output_ch;

    /* return the new output pointer with offset */
    return out_0;
#else
    (void)input_a;
    (void)input_b;
    (void)output_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)num_col_a;
    (void)output_bias;
    (void)out_0;
    /* To be completed */
    return NULL;
#endif
}

View File

@ -1,180 +0,0 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_s8.c
* Description: General Matrix-multiplication function
*
* $Date: 27. October 2021
* $Revision: V.2.0.6
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/*
* s8 General matrix multiplication function with per-channel requantization for up to 4 column batches.
*
* Refer header file for details.
*
*/
/*
 * s8 matrix multiplication with per-channel requantization, implemented with MVE
 * intrinsics. Only the ARM_MATH_MVEI build has a body; in every other build all
 * arguments are ignored and NULL is returned.
 *
 * input_row       s8 data read as output_ch rows of row_len elements each.
 * input_col       s8 data read as column batches of row_len elements each.
 * output_ch       number of rows in input_row / results per column batch.
 * col_batches     number of column batches in input_col.
 * output_shift    per-channel shift, indexed by output channel.
 * output_mult     per-channel multiplier, indexed by output channel.
 * out_offset      value added to every requantized sum.
 * col_offset      value added to every column element before it is multiplied.
 * row_offset      not used by this implementation.
 * activation_min  lower clamp bound applied before the store.
 * activation_max  upper clamp bound applied before the store.
 * row_len         number of elements per row / per column batch.
 * bias            optional per-channel value added to the sum; may be NULL.
 * out             destination; the result for batch b and channel c is written
 *                 to out[b * output_ch + c].
 *
 * Returns out advanced by output_ch for every column batch that was processed.
 */
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
const uint16_t col_batches,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t out_offset,
const int32_t col_offset,
const int32_t row_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t row_len,
const int32_t *const bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
(void)row_offset;
/* Fast path: exactly four column batches are processed together per output channel. */
if (col_batches == 4)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
/* Remaining element count; drives the lane predicate of each 8-wide step. */
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
/* One pointer per column batch; batches are row_len elements apart. */
const int8_t *ip_c0 = input_col;
const int8_t *ip_c1 = input_col + row_len;
const int8_t *ip_c2 = input_col + (2 * row_len);
const int8_t *ip_c3 = input_col + (3 * row_len);
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
int32_t acc_3 = 0;
/* Number of 8-element steps, rounded up so a partial last step is included. */
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
/* Predicate enabling only the lanes that still hold valid elements. */
mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
/* col_offset is only written to the predicated lanes; the others stay uninitialized. */
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
/*
 * NOTE(review): the column loads below are not predicated, so the last partial
 * step appears to read a full 8 bytes from each batch - confirm the buffers
 * allow this. The inactive lanes do not reach the sums because the
 * multiply-accumulate is predicated.
 */
int16x8_t c0 = vldrbq_s16(ip_c0);
ip_c0 += 8;
c0 = vaddq_s16(c0, offset);
int16x8_t c1 = vldrbq_s16(ip_c1);
ip_c1 += 8;
c1 = vaddq_s16(c1, offset);
int16x8_t c2 = vldrbq_s16(ip_c2);
ip_c2 += 8;
c2 = vaddq_s16(c2, offset);
int16x8_t c3 = vldrbq_s16(ip_c3);
ip_c3 += 8;
c3 = vaddq_s16(c3, offset);
/* Row load is predicated with zeroing of the inactive lanes. */
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
/* Predicated multiply-accumulate of the row against each of the four batches. */
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p);
acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
}
/* Pack the four batch sums into one vector so they are post-processed together. */
int32x4_t res = {acc_0, acc_1, acc_2, acc_3};
if (bias)
{
res = vaddq_n_s32(res, bias[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(activation_min));
res = vminq_s32(res, vdupq_n_s32(activation_max));
/* Lane k is stored as one byte at out[i_out_ch + k * output_ch]. */
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
}
/* Four complete batches of output_ch results were written. */
out += 4 * output_ch;
}
else
{
/*
 * Generic path: one column batch at a time.
 * NOTE(review): the loop starts at (col_batches & ~0x3) and stops at
 * (col_batches & 0x3). For col_batches of 1..3 this covers batches
 * 0..col_batches-1; for any value above 4 the start is >= 4 while the bound
 * is <= 3, so nothing is processed - confirm callers never pass more than 4.
 */
for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t row_len_tmp = row_len;
const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
const int8_t *ip_c0 = input_col + (i_col_batch * row_len);
int32_t acc_0 = 0;
/* Number of 8-element steps, rounded up so a partial last step is included. */
const int32_t row_loop_cnt = (row_len + 7) / 8;
for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
{
/* Same predicated 8-wide step as in the four-batch path, for a single batch. */
const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
row_len_tmp -= 8;
int16x8_t c0 = vldrbq_s16(ip_c0);
ip_c0 += 8;
c0 = vaddq_s16(c0, offset);
int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
ip_r0 += 8;
acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
}
if (bias)
{
acc_0 += bias[i_out_ch];
}
/* Scalar post-processing: requantize, add offset, clamp, store as s8. */
acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]);
acc_0 += out_offset;
acc_0 = MAX(acc_0, activation_min);
acc_0 = MIN(acc_0, activation_max);
out[i_out_ch] = (q7_t)acc_0;
}
/* Move on to the output slot of the next column batch. */
out += output_ch;
}
}
return out;
#else
/* No implementation without MVE: silence unused-parameter warnings and report failure. */
(void)input_row;
(void)input_col;
(void)output_ch;
(void)col_batches;
(void)output_shift;
(void)output_mult;
(void)out_offset;
(void)col_offset;
(void)row_offset;
(void)activation_min;
(void)activation_max;
(void)row_len;
(void)bias;
(void)out;
return NULL;
#endif
}

View File

@ -1,21 +0,0 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Collect every source file in this directory whose name ends in "_s8.c".
file(GLOB SRC "./*_s8.c")
# Add the globbed files, plus arm_fully_connected_s16.c, as private sources of the cmsis-nn target.
target_sources(cmsis-nn PRIVATE ${SRC} arm_fully_connected_s16.c)

Some files were not shown because too many files have changed in this diff Show More