Added the CMSIS-DSP library to the model, along with all the other CMSIS libraries

2025-11-13 17:14:43 +03:00
parent 75bed20511
commit 5299cc5b12
1074 changed files with 380657 additions and 7 deletions


@@ -0,0 +1,29 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
cmake_minimum_required(VERSION 3.15.6)
project(CMSISNN)
set(CMSIS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../..")
option(BUILD_CMSIS_NN_FUNCTIONS "Build CMSIS-NN Source." ON)
if(BUILD_CMSIS_NN_FUNCTIONS)
add_subdirectory(Source)
endif()


@@ -0,0 +1,169 @@
/******************************************************************************
* @file arm_nn_math_types.h
* @brief Compiler include and basic types
* @version V1.1.0
* @date 09 March 2022
* Target Processor: Cortex-M
******************************************************************************/
/*
* Copyright (c) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
Copied from CMSIS/DSP/arm_math_types.h and modified
*/
#ifndef _ARM_NN_MATH_TYPES_H_
#define _ARM_NN_MATH_TYPES_H_
/* DSP include for enum arm_status. */
#include "arm_math_types.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
/* Included for intrinsics definitions */
#if defined(_MSC_VER)
#include <stdint.h>
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static __forceinline
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static __inline
#endif
#ifndef __ALIGNED
#define __ALIGNED(x) __declspec(align(x))
#endif
#elif defined(__GNUC_PYTHON__)
#include <stdint.h>
#ifndef __ALIGNED
#define __ALIGNED(x) __attribute__((aligned(x)))
#endif
#ifndef __STATIC_FORCEINLINE
#define __STATIC_FORCEINLINE static inline __attribute__((always_inline))
#endif
#ifndef __STATIC_INLINE
#define __STATIC_INLINE static inline
#endif
#else
#include "cmsis_compiler.h"
#endif
#include <float.h>
#include <limits.h>
#include <math.h>
#include <string.h>
/* evaluate ARM DSP feature */
#if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
#ifndef ARM_MATH_DSP
#define ARM_MATH_DSP 1
#endif
#endif
#if __ARM_FEATURE_MVE
#ifndef ARM_MATH_MVEI
#define ARM_MATH_MVEI
#endif
#endif
/* Compiler specific diagnostic adjustment */
#if defined(__CC_ARM)
#elif defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#elif defined(__GNUC__)
// #pragma GCC diagnostic pop
#elif defined(__ICCARM__)
#elif defined(__TI_ARM__)
#elif defined(__CSMC__)
#elif defined(__TASKING__)
#elif defined(_MSC_VER)
#else
#error Unknown compiler
#endif
#ifdef __cplusplus
}
#endif
#if __ARM_FEATURE_MVE
#include <arm_mve.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/**
 * @brief Saturation limits for the q7, q15 and q31 data types
*/
#define NN_Q31_MAX ((q31_t)(0x7FFFFFFFL))
#define NN_Q15_MAX ((q15_t)(0x7FFF))
#define NN_Q7_MAX ((q7_t)(0x7F))
#define NN_Q31_MIN ((q31_t)(0x80000000L))
#define NN_Q15_MIN ((q15_t)(0x8000))
#define NN_Q7_MIN ((q7_t)(0x80))
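/**
 * A minimal sketch showing how the saturation limits above can be used to clamp
 * a wider accumulator into the q15 range. The helper name nn_clamp_q15_sketch is
 * hypothetical and shown for illustration only.
 */
__STATIC_FORCEINLINE q15_t nn_clamp_q15_sketch(int32_t value)
{
    if (value > NN_Q15_MAX)
    {
        value = NN_Q15_MAX;
    }
    if (value < NN_Q15_MIN)
    {
        value = NN_Q15_MIN;
    }
    return (q15_t)value;
}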
/**
* @brief Error status returned by some functions in the library.
*/
typedef enum
{
ARM_CMSIS_NN_SUCCESS = 0, /**< No error */
ARM_CMSIS_NN_ARG_ERROR = -1, /**< One or more arguments are incorrect */
ARM_CMSIS_NN_NO_IMPL_ERROR = -2, /**< No implementation available */
} arm_cmsis_nn_status;
#ifdef __cplusplus
}
#endif
#endif /*ifndef _ARM_NN_MATH_TYPES_H_ */


@@ -0,0 +1,56 @@
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_tables.h
* Description: Extern declaration for NN tables
*
* $Date: 17. August 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _ARM_NN_TABLES_H
#define _ARM_NN_TABLES_H
#include "arm_nn_math_types.h"
/**
* @brief tables for various activation functions
*
*/
extern const q15_t sigmoidTable_q15[256];
extern const q7_t sigmoidTable_q7[256];
extern const q7_t tanhTable_q7[256];
extern const q15_t tanhTable_q15[256];
/**
* @brief 2-way tables for various activation functions
*
 * 2-way table: the L table covers values smaller than 1/4 and the H table
 * covers the remaining range. This is provided only for the q15_t version;
 * it does not make sense for the q7_t type.
*/
extern const q15_t sigmoidHTable_q15[192];
extern const q15_t sigmoidLTable_q15[128];
#endif /* _ARM_NN_TABLES_H */


@@ -0,0 +1,137 @@
/*
* Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_types.h
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
 * $Date: 22. February 2022
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#ifndef _ARM_NN_TYPES_H
#define _ARM_NN_TYPES_H
#include <stdint.h>
/** CMSIS-NN object to contain the width and height of a tile */
typedef struct
{
int32_t w; /**< Width */
int32_t h; /**< Height */
} cmsis_nn_tile;
/** CMSIS-NN object used for the function context. */
typedef struct
{
void *buf; /**< Pointer to a buffer needed for the optimization */
int32_t size; /**< Buffer size */
} cmsis_nn_context;
/** CMSIS-NN object to contain the dimensions of the tensors */
typedef struct
{
int32_t n; /**< Generic dimension to contain either the batch size or output channels.
Please refer to the function documentation for more information */
int32_t h; /**< Height */
int32_t w; /**< Width */
int32_t c; /**< Input channels */
} cmsis_nn_dims;
/** CMSIS-NN object for the per-channel quantization parameters */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
} cmsis_nn_per_channel_quant_params;
/** CMSIS-NN object for the per-tensor quantization parameters */
typedef struct
{
int32_t multiplier; /**< Multiplier value */
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;
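/**
 * A simplified sketch of how a multiplier/shift pair is conceptually applied to
 * a 32-bit accumulator: the effective scale is multiplier * 2^shift / 2^31. The
 * library's own requantization routines use optimized fixed-point arithmetic with
 * rounding and saturation; those details are omitted here and the helper name is
 * hypothetical.
 */
static inline int32_t cmsis_nn_requantize_sketch(int32_t acc, int32_t multiplier, int32_t shift)
{
    /* Q31 multiply followed by the remaining power-of-two scaling (rounding omitted) */
    int64_t scaled = (int64_t)acc * (int64_t)multiplier;
    return (int32_t)(scaled >> (31 - shift));
}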
/** CMSIS-NN object for the quantized Relu activation */
typedef struct
{
int32_t min; /**< Min value used to clamp the result */
int32_t max; /**< Max value used to clamp the result */
} cmsis_nn_activation;
/** CMSIS-NN object for the convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_conv_params;
/** CMSIS-NN object for Depthwise convolution layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
int32_t ch_mult; /**< Channel Multiplier. ch_mult * in_ch = out_ch */
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_tile dilation;
cmsis_nn_activation activation;
} cmsis_nn_dw_conv_params;
/** CMSIS-NN object for pooling layer parameters */
typedef struct
{
cmsis_nn_tile stride;
cmsis_nn_tile padding;
cmsis_nn_activation activation;
} cmsis_nn_pool_params;
/** CMSIS-NN object for Fully Connected layer parameters */
typedef struct
{
int32_t input_offset; /**< Zero value for the input tensor */
int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation activation;
} cmsis_nn_fc_params;
/** CMSIS-NN object for SVDF layer parameters */
typedef struct
{
int32_t rank;
int32_t input_offset; /**< Zero value for the input tensor */
int32_t output_offset; /**< Zero value for the output tensor */
cmsis_nn_activation input_activation;
cmsis_nn_activation output_activation;
} cmsis_nn_svdf_params;
/** CMSIS-NN object for Softmax s16 layer parameters */
typedef struct
{
const int16_t *exp_lut;
const int16_t *one_by_one_lut;
} cmsis_nn_softmax_lut_s16;
#endif // _ARM_NN_TYPES_H


@@ -0,0 +1,2532 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 19 April 2022
* $Revision: V.9.0.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
/**
\mainpage CMSIS NN Software Library
*
* Introduction
* ------------
*
* This user manual describes the CMSIS NN software library,
* a collection of efficient neural network kernels developed to maximize the
* performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
*
* The library is divided into a number of functions each covering a specific category:
* - Convolution Functions
* - Activation Functions
* - Fully-connected Layer Functions
* - SVDF Layer Functions
* - Pooling Functions
* - Softmax Functions
* - Basic math Functions
*
* The library has separate functions for operating on different weight and activation data
 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
 * kernels is included in the function description. The implementation details are also
 * described in the paper [1].
*
* Supported Processors
* -------
 * CMSIS-NN targets Cortex-M processors and typically provides three different implementations for each function, each
 * targeting a different group of processors:
 * - Processors without SIMD capability (e.g. Cortex-M0)
 * - Processors with the DSP extension (e.g. Cortex-M4)
 * - Processors with the MVE extension (e.g. Cortex-M55)
 * The right implementation is picked through feature flags, and the user usually does not have to set it explicitly.
*
* Function Classification
* --------
 * The functions can be classified into two segments:
 * - Legacy functions supporting Arm's internal symmetric quantization (8 bits).
 * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
*
 * The legacy functions can be identified by their _q7 or _q15 suffix; no new development is done there.
* The article in [2] describes in detail how to run a network using the legacy functions.
*
 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
 * Micro. The functions are bit exact to TensorFlow Lite. Refer to TensorFlow's documentation in [3] on how to run
* a TensorFlow Lite model using optimized CMSIS-NN kernels.
*
* Block Diagram
* --------
* \image html CMSIS-NN-OVERVIEW.PNG
*
* Examples
* --------
*
* The library ships with a number of examples which demonstrate how to use the library functions.
*
* Pre-processor Macros
* ------------
*
 * Each library project has different pre-processor macros.
*
* - ARM_MATH_DSP:
*
 * Define the macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
*
* - ARM_MATH_MVEI:
*
 * Define the macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
* - ARM_MATH_AUTOVECTORIZE
 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
* assembly. It does not affect functions that use C or intrinsics.
* - ARM_MATH_BIG_ENDIAN:
*
 * Define the macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. This is supported only for the legacy
 * functions, i.e. functions targeted at TensorFlow Lite do not support big endianness. By default the library builds for
 * little-endian targets.
*
* - ARM_NN_TRUNCATE:
*
* Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
*
*
* Copyright Notice
* ------------
*
* Copyright (C) 2010-2019 Arm Limited. All rights reserved.
*
* [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
*
* [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
*
https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
* [3] https://www.tensorflow.org/lite/microcontrollers/library
*
* [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
*/
/**
* @defgroup groupNN Neural Network Functions
* A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
 * the TensorFlow Lite framework.
*/
#ifndef _ARM_NNFUNCTIONS_H
#define _ARM_NNFUNCTIONS_H
#include "arm_nn_math_types.h"
#include "arm_nn_types.h"
#define USE_INTRINSIC
//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor or round to the nearest int */
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Struct for specifying activation function types
*
*/
typedef enum
{
ARM_SIGMOID = 0,
/**< Sigmoid activation function */
ARM_TANH = 1,
/**< Tanh activation function */
} arm_nn_activation_type;
/**
* @defgroup NNConv Convolution Functions
*
* Collection of convolution, depthwise convolution functions and their variants.
*
* The convolution is implemented in 2 steps: im2col and GEMM
*
* im2col is a process of converting each patch of image data into
* a column. After im2col, the convolution is computed as matrix-matrix
* multiplication.
*
* To reduce the memory footprint, the im2col is performed partially.
 * Each iteration, only a few columns (i.e., patches) are generated and
* computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
*
*/
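/**
 * A minimal sketch of the im2col step described above for a single output
 * position: the receptive field of an HWC input is copied into one contiguous
 * column so that the convolution reduces to a matrix-matrix multiplication.
 * Padding handling is omitted and all names are illustrative; the library's
 * internal im2col is performed partially and in an optimized form.
 */
static void arm_nn_im2col_one_column_sketch(const q7_t *input,
                                            const int32_t in_w,
                                            const int32_t in_c,
                                            const int32_t k_h,
                                            const int32_t k_w,
                                            const int32_t out_y,
                                            const int32_t out_x,
                                            const int32_t stride_h,
                                            const int32_t stride_w,
                                            q7_t *column)
{
    for (int32_t ky = 0; ky < k_h; ky++)
    {
        for (int32_t kx = 0; kx < k_w; kx++)
        {
            const int32_t y = out_y * stride_h + ky;
            const int32_t x = out_x * stride_w + kx;
            /* Copy all input channels of one pixel of the receptive field */
            memcpy(column, &input[(y * in_w + x) * in_c], (size_t)in_c * sizeof(q7_t));
            column += in_c;
        }
    }
}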
/**
 * @brief s8 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
cmsis-nn
* to perform the convolution.
*
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
* <code>ARM_MATH_SUCCESS</code> on successful completion.
*
*/
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for arm_convolve_wrapper_s8
*
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
* filter dimensions
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
*
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);
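/**
 * A minimal calling sketch for the s8 wrapper above. It assumes the caller has
 * already filled in the cmsis_nn_* parameter structures and the data pointers,
 * and that scratch_buf points to at least the number of bytes reported by
 * arm_convolve_wrapper_s8_get_buffer_size(). The helper and parameter names here
 * are hypothetical.
 */
static inline arm_status arm_convolve_wrapper_s8_call_sketch(const cmsis_nn_conv_params *conv_params,
                                                             const cmsis_nn_per_channel_quant_params *quant_params,
                                                             const cmsis_nn_dims *input_dims,
                                                             const q7_t *input_data,
                                                             const cmsis_nn_dims *filter_dims,
                                                             const q7_t *filter_data,
                                                             const cmsis_nn_dims *bias_dims,
                                                             const int32_t *bias_data,
                                                             const cmsis_nn_dims *output_dims,
                                                             q7_t *output_data,
                                                             void *scratch_buf)
{
    cmsis_nn_context ctx;
    /* Query the scratch requirement and hand the caller-provided buffer to the kernel */
    ctx.size = arm_convolve_wrapper_s8_get_buffer_size(conv_params, input_dims, filter_dims, output_dims);
    ctx.buf = scratch_buf;
    return arm_convolve_wrapper_s8(&ctx,
                                   conv_params,
                                   quant_params,
                                   input_dims,
                                   input_data,
                                   filter_dims,
                                   filter_data,
                                   bias_dims,
                                   bias_data,
                                   output_dims,
                                   output_data);
}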
/**
 * @brief s16 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
cmsis-nn
* to perform the convolution.
*
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_wrapper_s16_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* conv_params->input_offset : Not used
* conv_params->output_offset : Not used
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int16
*
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
* <code>ARM_MATH_SUCCESS</code> on successful completion.
*
*/
arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
/**
* @brief Get the required buffer size for arm_convolve_wrapper_s16
*
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* conv_params->input_offset : Not used
* conv_params->output_offset : Not used
* @param[in] input_dims Input (activation) dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
* filter dimensions
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
*
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);
/**
* @brief Basic s8 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_s8_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
 * 2. q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
* 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
*
*/
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for s8 convolution function
*
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
* are the spatial filter dimensions
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
* @brief Basic s16 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_s16_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* conv_params->input_offset : Not used
* conv_params->output_offset : Not used
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
* 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
*
*/
arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
/**
* @brief Optimized s16 convolution function
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_fast_s16_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* conv_params->input_offset : Not used
* conv_params->output_offset : Not used
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
* spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must not
exceed 512
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
 * 2. q7/q15 is used as data type even though it is s8/s16 data. It is done so to be consistent with existing APIs.
* 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
* 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
*
*/
arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
/**
* @brief Get the required buffer size for s16 convolution function
*
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
* are the spatial filter dimensions
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
* @brief Get the required buffer size for fast s16 convolution function
*
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
* are the spatial filter dimensions
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
* @brief Basic Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
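/**
 * A sketch of how bias_shift and out_shift act in the legacy q7 kernels: the
 * bias is shifted up into the accumulator domain, and the accumulated result is
 * shifted back down and saturated to the q7 range. The rounding behaviour of the
 * actual kernels (see ARM_NN_TRUNCATE) is omitted and the helper name is
 * hypothetical.
 */
static inline q7_t arm_nn_legacy_requant_sketch(int32_t acc, q7_t bias, uint16_t bias_shift, uint16_t out_shift)
{
    int32_t sum = acc + ((int32_t)bias << bias_shift); /* bias scaled up to the accumulator */
    sum = sum >> out_shift;                            /* result scaled back down */
    if (sum > NN_Q7_MAX)
    {
        sum = NN_Q7_MAX;
    }
    if (sum < NN_Q7_MIN)
    {
        sum = NN_Q7_MIN;
    }
    return (q7_t)sum;
}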
/**
* @brief Basic Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Fast Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 4
 * ch_im_out is a multiple of 2
*/
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
/**
 * @brief Fast Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 4
 * ch_im_out is a multiple of 2
*/
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB);
/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
* <code>ARM_MATH_SUCCESS</code> on successful completion.
*
 * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for
 * the second half of MobileNets, after the depthwise separable convolution.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 4
 * ch_im_out is a multiple of 2
*/
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Fast s8 version for 1x1 convolution (non-square shape)
*
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
* <code>ARM_MATH_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework : TensorFlow Lite Micro
 * - The following constraints on the arguments apply
* -# input_dims->c is a multiple of 4
* -# conv_params->padding.w = conv_params->padding.h = 0
* -# conv_params->stride.w = conv_params->stride.h = 1
*
*/
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for arm_convolve_1x1_s8_fast
*
* @param[in] input_dims Input (activation) dimensions
* @return The function returns the required buffer size in bytes
*
*/
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);
/**
* @brief 1xn convolution
*
* @param[in, out] ctx Function context that contains the additional buffer if required by the function.
arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required
* @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
* Range of conv_params->input_offset : [-127, 128]
* Range of conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
* spatial filter dimension
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Optional bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail. or,
* <code>ARM_MATH_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework : TensorFlow Lite Micro
 * - The following constraints on the arguments apply
 * -# input_dims->n equals 1
 * -# output_dims->w is a multiple of 4
 * -# Explicit constraints (since it is for 1xN convolution)
* -## input_dims->h equals 1
* -## output_dims->h equals 1
* -## filter_dims->h equals 1
*@todo Remove constraint on output_dims->w to make the function generic.
*
*/
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required additional buffer size for 1xn convolution
*
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
* horizontal spatial filter dimension
* @return The function returns required buffer size(bytes)
*
*/
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
* @brief Q7 version of convolution for RGB image
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This kernel is written exclusively for convolution with ch_im_in
 * equal to 3. This applies to the first layer of CNNs, which has an input
 * image in RGB format.
*/
arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Fast Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 2
 * ch_im_out is a multiple of 2
* dim_im_out is a multiple of 2
*/
arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
/**
 * @brief Fast Q15 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
 * ch_im_in is a multiple of 2
 *
 * ch_im_out is a multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 2
 * ch_im_out is a multiple of 2
*/
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Q7 depthwise separable convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding sizes x
* @param[in] padding_y padding sizes y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in is a multiple of 2
 * ch_im_out is a multiple of 2
*/
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB);
/**
* @brief Wrapper function to pick the right optimized s8 depthwise convolution function
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if required.
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* dw_conv_params->dilation is not used.
* Range of dw_conv_params->input_offset : [-127, 128]
* Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each
* output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Batch argument N is not used and assumed to be 1.
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns
* <code>ARM_MATH_SUCCESS</code> - Successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
 * - Picks one of the following functions
* -# arm_depthwise_conv_s8()
* -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
* -# arm_depthwise_conv_s8_opt()
 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
* - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
* boundary.
*/
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
*
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* dw_conv_params->dilation is not used.
* Range of dw_conv_params->input_offset : [-127, 128]
 * Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Batch argument N is not used and assumed to be 1.
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
* @return Size of additional memory required for optimizations in bytes.
*
*/
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);
/**
* @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* dw_conv_params->dilation is not used.
* Range of dw_conv_params->input_offset : [-127, 128]
 * Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each
* output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Batch argument N is not used.
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* - Supported framework: TensorFlow Lite
 * - q7 is used as data type even though it is s8 data. It is done so to be consistent with existing APIs.
*/
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                dw_conv_params->input_offset  : Not used
 *                                dw_conv_params->output_offset : Not used
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each
* output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Batch argument N is not used.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[in, out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* - Supported framework: TensorFlow Lite
 * - q15 is used as data type even though it is s16 data. This is done to be consistent with existing APIs.
*/
arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
/**
* @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
 *        the input arguments (documented below). Refer to arm_depthwise_conv_s8() for function
 *        argument details.
*
* @return The function returns one of the following
* <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors
* <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @details
* - Supported framework : TensorFlow Lite Micro
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels
* -# Filter height and width equals 3
* -# Padding along x is either 0 or 1.
*
*/
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s8() for function argument details.
*
* @return The function returns one of the following
* <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or
* ch_mult != 1
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
 *       for the following if MVE optimizations (Arm Helium Technology) are used.
* - Output shift
* - Output multiplier
* - Output bias
* - kernel
* @details
* - Supported framework: TensorFlow Lite
 *    - The following constraints on the arguments apply
 *        -# Number of input channels equals number of output channels or ch_mult equals 1
 *    - q7 is used as data type even though it is s8 data. This is done to be consistent with existing APIs.
 *    - Recommended when the number of channels is 4 or greater.
*
*/
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for optimized s8 depthwise convolution
* function with constraint that in_channel equals out_channel.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
* Batch argument N is not used.
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
* @defgroup FC Fully-connected Layer Functions
*
* Collection of fully-connected and matrix multiplication functions.
*
 * A fully-connected layer is essentially a matrix-vector multiplication
 * with bias. The matrix holds the weights and the input/output vectors
 * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
 *
 * There are two types of kernel functions: the basic functions implement
 * a regular GEMV approach, while the opt functions operate on weights
 * stored in an interleaved format.
*
*/
/**
*@brief Q7 basic fully-connected layer function
*@param[in] pV pointer to input vector
*@param[in] pM pointer to matrix weights
*@param[in] dim_vec length of the vector
*@param[in] num_of_rows number of rows in weight matrix
*@param[in] bias_shift amount of left-shift for bias
*@param[in] out_shift amount of right-shift for output
*@param[in] bias pointer to bias
*@param[in,out] pOut pointer to output vector
*@param[in,out] vec_buffer pointer to buffer space for input
*@return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_q7(const q7_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut,
q15_t *vec_buffer);
/**
* @brief Basic s8 Fully Connected function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] fc_params Fully Connected layer parameters.
* Range of fc_params->input_offset : [-127, 128]
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* - Supported framework: TensorFlow Lite
 * - q7 is used as data type even though it is s8 data. This is done to be consistent with existing APIs.
*/
arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for S8 basic fully-connected and
* matrix multiplication layer function for TF Lite
* @param[in] filter_dims dimension of filter
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
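/*
 * Illustrative sketch (not part of the original header): the activation is treated as
 * N x (H * W * C_IN), so a 1x1x1x64 input against a 64x10 weight matrix yields a 1x10
 * output. All numbers, quantization constants and the my_scratch_buffer / tensor pointer
 * names are hypothetical placeholders.
 *
 *   cmsis_nn_fc_params fc_params = {0};
 *   fc_params.input_offset   = 128;
 *   fc_params.filter_offset  = 0;
 *   fc_params.output_offset  = -10;
 *   fc_params.activation.min = -128;
 *   fc_params.activation.max = 127;
 *
 *   cmsis_nn_per_tensor_quant_params quant_params = {.multiplier = 1073741824, .shift = -3};
 *
 *   cmsis_nn_dims input_dims  = {.n = 1, .h = 1, .w = 1, .c = 64};
 *   cmsis_nn_dims filter_dims = {.n = 64, .c = 10};   // N: accumulation depth, C: output depth
 *   cmsis_nn_dims bias_dims   = {.c = 10};
 *   cmsis_nn_dims output_dims = {.n = 1, .c = 10};
 *
 *   cmsis_nn_context ctx;
 *   ctx.size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
 *   ctx.buf  = (ctx.size > 0) ? my_scratch_buffer : NULL;   // user-provided scratch area
 *
 *   arm_status status = arm_fully_connected_s8(&ctx, &fc_params, &quant_params,
 *                                              &input_dims, input_data,
 *                                              &filter_dims, filter_data,
 *                                              &bias_dims, bias_data,
 *                                              &output_dims, output_data);
 */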
/**
* @brief Basic s16 Fully Connected function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] fc_params Fully Connected layer parameters.
* fc_params->input_offset : 0
* fc_params->filter_offset : 0
* fc_params->output_offset : 0
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int64
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int16
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* - Supported framework: TensorFlow Lite
 * - q15 is used as data type even though it is s16 data. This is done to be consistent with existing APIs.
*/
arm_status arm_fully_connected_s16(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_tensor_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
/**
* @brief Get the required buffer size for S16 basic fully-connected and
* matrix multiplication layer function for TF Lite
* @param[in] filter_dims dimension of filter
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_fully_connected_s16_get_buffer_size(const cmsis_nn_dims *filter_dims);
/**
* @brief Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_q7_opt(const q7_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut,
q15_t *vec_buffer);
/**
* @brief Q15 basic fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_q15(const q15_t *pV,
const q15_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t *bias,
q15_t *pOut,
q15_t *vec_buffer);
/**
* @brief Q15 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_q15_opt(const q15_t *pV,
const q15_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q15_t *bias,
q15_t *pOut,
q15_t *vec_buffer);
/**
* @brief Mixed Q15-Q7 fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q15_t *pOut,
q15_t *vec_buffer);
/**
* @brief Mixed Q15-Q7 opt fully-connected layer function
* @param[in] pV pointer to input vector
* @param[in] pM pointer to matrix weights
* @param[in] dim_vec length of the vector
* @param[in] num_of_rows number of rows in weight matrix
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias pointer to bias
* @param[in,out] pOut pointer to output vector
* @param[in,out] vec_buffer pointer to buffer space for input
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
const q7_t *pM,
const uint16_t dim_vec,
const uint16_t num_of_rows,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q15_t *pOut,
q15_t *vec_buffer);
/**
* @brief Matrix-Multiplication Kernels for Convolution
*
* These functions are used within convolution layer functions for
* matrix multiplication.
*
* The implementation is similar to CMSIS-DSP arm_mat_mult functions
 * with one Q7 and one Q15 operand. The Q15 operand is the im2col
 * output, which always has 2 columns.
*
*/
/**
* @brief Matrix-multiplication function for convolution
* @param[in] pA pointer to operand A
 * @param[in]       pInBuffer    pointer to operand B, always consists of 2 vectors
* @param[in] ch_im_out numRow of A
* @param[in] numCol_A numCol of A
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in] bias the bias
* @param[in,out] pOut pointer to output
* @return The function returns the incremented output pointer
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
const q15_t *pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut);
#ifdef __cplusplus
}
#endif
/*
* Other functions
* These layers are typically not timing critical
* Basic implementation is supported here
*/
#ifdef __cplusplus
extern "C" {
#endif
/**
* @defgroup BasicMath Basic math functions
*
* Elementwise add and multiplication functions.
*
*/
/**
* @brief s8 elementwise add of two vectors
* @param[in] input_1_vect pointer to input vector 1
* @param[in] input_2_vect pointer to input vector 2
* @param[in] input_1_offset offset for input 1. Range: -127 to 128
* @param[in] input_1_mult multiplier for input 1
* @param[in] input_1_shift shift for input 1
* @param[in] input_2_offset offset for input 2. Range: -127 to 128
* @param[in] input_2_mult multiplier for input 2
* @param[in] input_2_shift shift for input 2
* @param[in] left_shift input left shift
* @param[in,out] output pointer to output vector
* @param[in] out_offset output offset. Range: -128 to 127
* @param[in] out_mult output multiplier
* @param[in] out_shift output shift
* @param[in] out_activation_min minimum value to clamp output to. Min: -128
* @param[in] out_activation_max maximum value to clamp output to. Max: 127
* @param[in] block_size number of samples
* @return The function returns ARM_MATH_SUCCESS
*/
arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size);
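/*
 * Illustrative sketch (not part of the original header): both operands are first rescaled
 * with their (offset, multiplier, shift) triplets, added, then requantized to the output
 * scale and clamped. The quantization constants below are hypothetical placeholders that
 * a converter would normally generate.
 *
 *   arm_status status = arm_elementwise_add_s8(input_1, input_2,
 *                                              1,           // input_1_offset
 *                                              1073741824,  // input_1_mult
 *                                              0,           // input_1_shift
 *                                              2,           // input_2_offset
 *                                              1073741824,  // input_2_mult
 *                                              0,           // input_2_shift
 *                                              20,          // left_shift
 *                                              output,
 *                                              -3,          // out_offset
 *                                              1073741824,  // out_mult
 *                                              -20,         // out_shift
 *                                              -128, 127,   // out_activation_min/max
 *                                              block_size);
 */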
/**
* @brief s16 elementwise add of two vectors
* @param[in] input_1_vect pointer to input vector 1
* @param[in] input_2_vect pointer to input vector 2
* @param[in] input_1_offset offset for input 1. Not used.
* @param[in] input_1_mult multiplier for input 1
* @param[in] input_1_shift shift for input 1
* @param[in] input_2_offset offset for input 2. Not used.
* @param[in] input_2_mult multiplier for input 2
* @param[in] input_2_shift shift for input 2
* @param[in] left_shift input left shift
* @param[in,out] output pointer to output vector
* @param[in] out_offset output offset. Not used.
* @param[in] out_mult output multiplier
* @param[in] out_shift output shift
* @param[in] out_activation_min minimum value to clamp output to. Min: -32768
* @param[in] out_activation_max maximum value to clamp output to. Max: 32767
* @param[in] block_size number of samples
* @return The function returns ARM_MATH_SUCCESS
*/
arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size);
/**
* @brief s8 elementwise multiplication
* @param[in] input_1_vect pointer to input vector 1
* @param[in] input_2_vect pointer to input vector 2
* @param[in] input_1_offset offset for input 1. Range: -127 to 128
* @param[in] input_2_offset offset for input 2. Range: -127 to 128
* @param[in,out] output pointer to output vector
* @param[in] out_offset output offset. Range: -128 to 127
* @param[in] out_mult output multiplier
* @param[in] out_shift output shift
* @param[in] out_activation_min minimum value to clamp output to. Min: -128
* @param[in] out_activation_max maximum value to clamp output to. Max: 127
* @param[in] block_size number of samples
* @return The function returns ARM_MATH_SUCCESS
*
* @details Supported framework: TensorFlow Lite micro
*/
arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size);
/**
* @brief s16 elementwise multiplication
* @param[in] input_1_vect pointer to input vector 1
* @param[in] input_2_vect pointer to input vector 2
* @param[in] input_1_offset offset for input 1. Not used.
* @param[in] input_2_offset offset for input 2. Not used.
* @param[in,out] output pointer to output vector
* @param[in] out_offset output offset. Not used.
* @param[in] out_mult output multiplier
* @param[in] out_shift output shift
* @param[in] out_activation_min minimum value to clamp output to. Min: -32768
* @param[in] out_activation_max maximum value to clamp output to. Max: 32767
* @param[in] block_size number of samples
* @return The function returns ARM_MATH_SUCCESS
*
* @details Supported framework: TensorFlow Lite micro
*/
arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size);
/**
* @defgroup Acti Activation Functions
*
* Perform activation layers, including ReLU (Rectified Linear Unit),
* sigmoid and tanh
*
*/
/**
* @brief Q7 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
* @return none.
*/
void arm_relu_q7(q7_t *data, uint16_t size);
/**
* @brief s8 ReLU6 function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*/
void arm_relu6_s8(q7_t *data, uint16_t size);
/**
* @brief Q15 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
* @return none.
*/
void arm_relu_q15(q15_t *data, uint16_t size);
/**
* @brief Q7 neural network activation function using direct table look-up
* @param[in,out] data pointer to input
* @param[in] size number of elements
 * @param[in]     int_width   bit-width of the integer part, assumed to be smaller than 3
* @param[in] type type of activation functions
* @return none.
*/
void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
/**
* @brief Q15 neural network activation function using direct table look-up
* @param[in,out] data pointer to input
* @param[in] size number of elements
 * @param[in]     int_width   bit-width of the integer part, assumed to be smaller than 3
* @param[in] type type of activation functions
* @return none.
*
* @details
*
* This is the direct table look-up approach.
*
 * The integer part of the fixed-point value is assumed to be <= 3.
 * A larger integer part adds little value: after saturation, any of
 * these activation functions produces essentially the same result.
*/
void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
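/*
 * Illustrative sketch (not part of the original header): in-place table-driven sigmoid on a
 * q7 buffer with 2 integer bits. The buffer name and length are hypothetical, and
 * ARM_SIGMOID / ARM_TANH are assumed to be the arm_nn_activation_type enumerators.
 *
 *   q7_t act_buf[32];                                    // filled with layer output beforehand
 *   arm_nn_activations_direct_q7(act_buf, 32, 2, ARM_SIGMOID);
 *   // act_buf now holds sigmoid(x) of each element, still in q7 format
 */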
/**
* @defgroup Pooling Pooling Functions
*
* Perform pooling functions, including max pooling and average pooling
*
*/
/**
* @brief Q7 max pooling function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] Im_out pointer to output tensor
* @return none.
*
*/
void arm_maxpool_q7_HWC(q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t *bufferA,
q7_t *Im_out);
/**
* @brief Q7 average pooling function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] Im_out pointer to output tensor
* @return none.
*
*/
void arm_avepool_q7_HWC(q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const uint16_t dim_im_out,
q7_t *bufferA,
q7_t *Im_out);
/**
* @brief s8 average pooling function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] pool_params Pooling parameters
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Argument 'N' is not used.
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
* Argument N and C are not used.
* @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
* Argument N is not used.
* C_OUT equals C_IN.
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @details
* - Supported Framework: TensorFlow Lite
*
*/
arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief Get the required buffer size for S8 average pooling function
* @param[in] dim_dst_width output tensor dimension
* @param[in] ch_src number of input tensor channels
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
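/*
 * Illustrative sketch (not part of the original header): 2x2 average pooling with stride 2
 * over an 8x8x16 activation. Sizes and the my_scratch_buffer / tensor pointer names are
 * hypothetical; the pooling parameter fields are assumed from arm_nn_types.h.
 *
 *   cmsis_nn_pool_params pool_params = {0};
 *   pool_params.stride.w = 2;  pool_params.stride.h = 2;
 *   pool_params.padding.w = 0; pool_params.padding.h = 0;
 *   pool_params.activation.min = -128;
 *   pool_params.activation.max = 127;
 *
 *   cmsis_nn_dims input_dims  = {.n = 1, .h = 8, .w = 8, .c = 16};
 *   cmsis_nn_dims filter_dims = {.h = 2, .w = 2};
 *   cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 16};
 *
 *   cmsis_nn_context ctx;
 *   ctx.size = arm_avgpool_s8_get_buffer_size(output_dims.w, input_dims.c);
 *   ctx.buf  = (ctx.size > 0) ? my_scratch_buffer : NULL;   // user-provided scratch area
 *
 *   arm_status status = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                      &filter_dims, &output_dims, output_data);
 */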
/**
* @brief s16 average pooling function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] pool_params Pooling parameters
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Argument 'N' is not used.
* @param[in] input_data Input (activation) data pointer. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
* Argument N and C are not used.
* @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
* Argument N is not used.
* C_OUT equals C_IN.
* @param[in, out] output_data Output data pointer. Data type: int16
* @return The function returns
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @details
* - Supported Framework: TensorFlow Lite
*
*/
arm_status arm_avgpool_s16(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const int16_t *input_data,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
int16_t *output_data);
/**
* @brief Get the required buffer size for S16 average pooling function
* @param[in] dim_dst_width output tensor dimension
* @param[in] ch_src number of input tensor channels
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_avgpool_s16_get_buffer_size(const int dim_dst_width, const int ch_src);
/**
* @brief s8 max pooling function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] pool_params Pooling parameters
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Argument 'N' is not used.
* @param[in] input_data Input (activation) data pointer. The input tensor must not
* overlap with the output tensor. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
* Argument N and C are not used.
* @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
* Argument N is not used.
* C_OUT equals C_IN.
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @details
* - Supported Framework: TensorFlow Lite
*
*/
arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
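/*
 * Illustrative sketch (not part of the original header): max pooling performs no
 * requantization, so only the pooling geometry and clamping range are needed. No scratch
 * buffer is passed here (ctx.buf = NULL); check the function definition file in case the
 * current implementation requires one. pool_params and the tensors are set up as in the
 * average pooling sketch above.
 *
 *   cmsis_nn_context ctx = {.buf = NULL, .size = 0};
 *   arm_status status = arm_max_pool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                       &filter_dims, &output_dims, output_data);
 */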
/**
* @brief s16 max pooling function.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* @param[in] pool_params Pooling parameters
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Argument 'N' is not used.
* @param[in] src Input (activation) data pointer. The input tensor must not
* overlap with the output tensor. Data type: int16
* @param[in] filter_dims Filter tensor dimensions. Format: [H, W]
* Argument N and C are not used.
* @param[in] output_dims Output tensor dimensions. Format: [H, W, C_OUT]
* Argument N is not used.
* C_OUT equals C_IN.
* @param[in, out] dst Output data pointer. Data type: int16
* @return The function returns
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @details
* - Supported Framework: TensorFlow Lite
*
*/
arm_status arm_max_pool_s16(const cmsis_nn_context *ctx,
const cmsis_nn_pool_params *pool_params,
const cmsis_nn_dims *input_dims,
const int16_t *src,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims,
int16_t *dst);
/**
* @defgroup Softmax Softmax Functions
*
* EXP(2) based softmax functions.
*
*/
/**
* @brief Q7 softmax function
* @param[in] vec_in pointer to input vector
* @param[in] dim_vec input vector dimension
* @param[out] p_out pointer to output vector
*
* @note This function is an optimized version which is not bit-accurate with
* TensorFlow Lite's kernel
*
*/
void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);
/**
* @brief Q7 softmax function with batch parameter
* @param[in] vec_in pointer to input vector
* @param[in] nb_batches number of batches
* @param[in] dim_vec input vector dimension
* @param[out] p_out pointer to output vector
* @return none.
*
* @note This function is an optimized version which is not bit-accurate with
* TensorFlow Lite's kernel
*
*/
void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out);
/**
* @brief Q15 softmax function
* @param[in] vec_in pointer to input vector
* @param[in] dim_vec input vector dimension
* @param[out] p_out pointer to output vector
* @return none.
*
* @note This function is an optimized version which is not bit-accurate with
* TensorFlow Lite's kernel
*
*/
void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out);
/**
* @brief S8 softmax function
* @param[in] input Pointer to the input tensor
* @param[in] num_rows Number of rows in the input tensor
* @param[in] row_size Number of elements in each input row
* @param[in] mult Input quantization multiplier
* @param[in] shift Input quantization shift within the range [0, 31]
* @param[in] diff_min Minimum difference with max in row. Used to check if
* the quantized exponential operation can be performed
* @param[out] output Pointer to the output tensor
*
* @note Supported framework: TensorFlow Lite micro (bit-accurate)
*
*/
void arm_softmax_s8(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int8_t *output);
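/*
 * Illustrative sketch (not part of the original header): softmax is applied row by row, so a
 * 4 x 10 logit tensor is passed as num_rows = 4 and row_size = 10. The mult, shift and
 * diff_min values are normally produced by the model converter; the numbers below are
 * hypothetical placeholders.
 *
 *   arm_softmax_s8(logits,       // int8 input, 4 * 10 elements
 *                  4,            // num_rows
 *                  10,           // row_size
 *                  1077952576,   // mult (hypothetical)
 *                  23,           // shift, within [0, 31]
 *                  -248,         // diff_min (hypothetical)
 *                  probs);       // int8 output, 4 * 10 elements
 */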
/**
* @brief S8 to s16 softmax function
* @param[in] input Pointer to the input tensor
* @param[in] num_rows Number of rows in the input tensor
* @param[in] row_size Number of elements in each input row
* @param[in] mult Input quantization multiplier
* @param[in] shift Input quantization shift within the range [0, 31]
* @param[in] diff_min Minimum difference with max in row. Used to check if
* the quantized exponential operation can be performed
* @param[out] output Pointer to the output tensor
*
* @note Supported framework: TensorFlow Lite micro (bit-accurate)
*
*/
void arm_softmax_s8_s16(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int16_t *output);
/**
* @brief S16 softmax function
* @param[in] input Pointer to the input tensor
* @param[in] num_rows Number of rows in the input tensor
* @param[in] row_size Number of elements in each input row
* @param[in] mult Input quantization multiplier
* @param[in] shift Input quantization shift within the range [0, 31]
 * @param[in]  softmax_params   Softmax s16 layer parameters with two pointers to LUTs specified below.
 *                              For indexing, the upper 9 bits are used and the remaining 7 bits for interpolation.
 *                              That means 512 entries for the 9-bit indexing plus 1 extra for interpolation, i.e. 513
 *                              values for each LUT.
 *                              - Lookup table for exp(x), where x is uniformly distributed over [-10.0, 0.0]
 *                              - Lookup table for 1 / (1 + x), where x is uniformly distributed over [0.0, 1.0]
* @param[out] output Pointer to the output tensor
* @return The function returns
* <code>ARM_MATH_ARGUMENT_ERROR</code> if LUTs are NULL
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
* @note Supported framework: TensorFlow Lite micro (bit-accurate)
*
*/
arm_status arm_softmax_s16(const int16_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const cmsis_nn_softmax_lut_s16 *softmax_params,
int16_t *output);
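/*
 * Illustrative sketch (not part of the original header): each LUT holds 513 int16 entries
 * (512 for the 9-bit index plus one extra for interpolation), one table for exp(x) on
 * [-10.0, 0.0] and one for 1 / (1 + x) on [0.0, 1.0]. The table names and contents are
 * hypothetical, and the exp_lut / one_by_one_lut field names are assumed from the
 * cmsis_nn_softmax_lut_s16 definition in arm_nn_types.h.
 *
 *   static const int16_t my_exp_lut[513]        = { ... };   // exp(x) samples, values omitted
 *   static const int16_t my_one_by_one_lut[513] = { ... };   // 1 / (1 + x) samples, values omitted
 *
 *   cmsis_nn_softmax_lut_s16 softmax_params;
 *   softmax_params.exp_lut        = my_exp_lut;
 *   softmax_params.one_by_one_lut = my_one_by_one_lut;
 *
 *   arm_status status = arm_softmax_s16(input, num_rows, row_size, mult, shift,
 *                                       &softmax_params, output);
 */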
/**
* @brief U8 softmax function
* @param[in] input Pointer to the input tensor
* @param[in] num_rows Number of rows in the input tensor
* @param[in] row_size Number of elements in each input row
* @param[in] mult Input quantization multiplier
* @param[in] shift Input quantization shift within the range [0, 31]
* @param[in] diff_min Minimum difference with max in row. Used to check if
* the quantized exponential operation can be performed
* @param[out] output Pointer to the output tensor
*
* @note Supported framework: TensorFlow Lite micro (bit-accurate)
*
*/
void arm_softmax_u8(const uint8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
uint8_t *output);
/**
* @brief uint8 depthwise convolution function with asymmetric quantization
* Unless specified otherwise, arguments are mandatory.
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_ch Channels in input tensor
* @param[in] kernel Pointer to kernel weights
* @param[in] kernel_x Width of kernel
* @param[in] kernel_y Height of kernel
 * @param[in]     ch_mult         Channel multiplier, i.e. number of output channels per input channel
* @param[in] pad_x Padding sizes x
* @param[in] pad_y Padding sizes y
* @param[in] stride_x stride along the width
* @param[in] stride_y stride along the height
* @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
* @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
* @param[in] bias Pointer to optional bias values. If no bias is
 *                                available, NULL is expected
* @param[in] input_offset Input tensor zero offset
* @param[in] filter_offset Kernel tensor zero offset
* @param[in] output_offset Output tensor zero offset
* @param[in,out] output Pointer to output tensor
* @param[in] output_x Width of output tensor
* @param[in] output_y Height of output tensor
* @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
* @param[in] out_shift Amount of right-shift for output
* @param[in] out_mult Output multiplier for requantization
* @return The function returns the following
* <code>ARM_MATH_SUCCESS</code> - Successful operation
*
*/
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const uint8_t *kernel,
const uint16_t kernel_x,
const uint16_t kernel_y,
const int16_t ch_mult,
const int16_t pad_x,
const int16_t pad_y,
const int16_t stride_x,
const int16_t stride_y,
const int16_t dilation_x,
const int16_t dilation_y,
const int32_t *bias,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_offset,
uint8_t *output,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const int32_t out_shift,
const int32_t out_mult);
/**
* @defgroup Reshape Reshape Functions
*
*/
/**
* @brief Reshape a s8 vector into another with different shape
* @param[in] input points to the s8 input vector
* @param[out] output points to the s8 output vector
* @param[in] total_size total size of the input and output vectors in bytes
*
* @note The output is expected to be in a memory area that does not overlap with the input's
*
*/
void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
/**
* @defgroup Concatenation Concatenation Functions
*
*/
/**
* @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
* This function should be called for each input tensor to concatenate. The argument offset_x
* will be used to store the input tensor in the correct position in the output tensor
*
* i.e. offset_x = 0
 *    for(i = 0; i < num_input_tensors; ++i)
* {
* arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
* offset_x += input_x[i]
* }
*
* This function assumes that the output tensor has:
* -# The same height of the input tensor
* -# The same number of channels of the input tensor
* -# The same batch size of the input tensor
*
* Unless specified otherwise, arguments are mandatory.
*
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
*
* @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor.
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_z Channels in input tensor
* @param[in] input_w Batch size in input tensor
* @param[out] output Pointer to output tensor. Expected to be at least
* (input_x * input_y * input_z * input_w) + offset_x
* bytes.
* @param[in] output_x Width of output tensor
* @param[in] offset_x The offset (in number of elements) on the X axis to start concatenating the input tensor
* It is user responsibility to provide the correct value
*
* <b> Input constraints</b>
* offset_x is less than output_x
*
*/
void arm_concatenation_s8_x(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_x,
const uint32_t offset_x);
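/*
 * Illustrative sketch (not part of the original header) of the accumulation pattern described
 * above: each input tensor is copied into the shared output tensor at an increasing X offset.
 * The input/output arrays and dimension variables are hypothetical.
 *
 *   uint32_t offset_x = 0;
 *   for (int i = 0; i < num_input_tensors; ++i)
 *   {
 *       arm_concatenation_s8_x(input[i], input_x[i], input_y, input_z, input_w,
 *                              output, output_x, offset_x);
 *       offset_x += input_x[i];
 *   }
 */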
/**
* @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
* This function should be called for each input tensor to concatenate. The argument offset_y
* will be used to store the input tensor in the correct position in the output tensor
*
* i.e. offset_y = 0
 *    for(i = 0; i < num_input_tensors; ++i)
* {
* arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
* offset_y += input_y[i]
* }
*
* This function assumes that the output tensor has:
* -# The same width of the input tensor
* -# The same number of channels of the input tensor
* -# The same batch size of the input tensor
*
* Unless specified otherwise, arguments are mandatory.
*
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
*
* @param[in] input Pointer to input tensor. Input tensor must not overlap with the output tensor.
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_z Channels in input tensor
* @param[in] input_w Batch size in input tensor
* @param[out] output Pointer to output tensor. Expected to be at least
* (input_z * input_w * input_x * input_y) + offset_y
* bytes.
* @param[in] output_y Height of output tensor
* @param[in] offset_y The offset on the Y axis to start concatenating the input tensor
* It is user responsibility to provide the correct value
*
* <b> Input constraints</b>
* offset_y is less than output_y
*
*/
void arm_concatenation_s8_y(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_y,
const uint32_t offset_y);
/**
* @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
* This function should be called for each input tensor to concatenate. The argument offset_z
* will be used to store the input tensor in the correct position in the output tensor
*
* i.e. offset_z = 0
 *    for(i = 0; i < num_input_tensors; ++i)
* {
* arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
* offset_z += input_z[i]
* }
*
* This function assumes that the output tensor has:
* -# The same width of the input tensor
* -# The same height of the input tensor
* -# The same batch size of the input tensor
*
* Unless specified otherwise, arguments are mandatory.
*
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
*
* @param[in] input Pointer to input tensor. Input tensor must not overlap with output tensor.
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_z Channels in input tensor
* @param[in] input_w Batch size in input tensor
* @param[out] output Pointer to output tensor. Expected to be at least
* (input_x * input_y * input_z * input_w) + offset_z
* bytes.
* @param[in] output_z Channels in output tensor
* @param[in] offset_z The offset on the Z axis to start concatenating the input tensor
* It is user responsibility to provide the correct value
*
* <b> Input constraints</b>
* offset_z is less than output_z
*
*/
void arm_concatenation_s8_z(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_z,
const uint32_t offset_z);
/**
* @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
* This function should be called for each input tensor to concatenate. The argument offset_w
* will be used to store the input tensor in the correct position in the output tensor
*
* i.e. offset_w = 0
 *    for(i = 0; i < num_input_tensors; ++i)
* {
* arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
* offset_w += input_w[i]
* }
*
* This function assumes that the output tensor has:
* -# The same width of the input tensor
* -# The same height of the input tensor
 *    -# The same number of channels of the input tensor
*
* Unless specified otherwise, arguments are mandatory.
*
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8 tensors
 *       because it does not involve any arithmetic operation
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_z Channels in input tensor
* @param[in] input_w Batch size in input tensor
* @param[out] output Pointer to output tensor. Expected to be at least
* input_x * input_y * input_z * input_w
* bytes.
* @param[in] offset_w The offset on the W axis to start concatenating the input tensor
* It is user responsibility to provide the correct value
*
*/
void arm_concatenation_s8_w(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint32_t offset_w);
/**
* @defgroup SVDF SVDF Layer Functions
*
*/
/**
* @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
*
* @param[in] input_ctx Temporary scratch buffer
* @param[in] output_ctx Temporary output scratch buffer
* @param[in] svdf_params SVDF Parameters
* Range of svdf_params->input_offset : [-128, 127]
* Range of svdf_params->output_offset : [-128, 127]
* @param[in] input_quant_params Input quantization parameters
* @param[in] output_quant_params Output quantization parameters
* @param[in] input_dims Input tensor dimensions
* @param[in] input_data Pointer to input tensor
* @param[in] state_dims State tensor dimensions
* @param[in] state_data Pointer to state tensor
* @param[in] weights_feature_dims Weights (feature) tensor dimensions
* @param[in] weights_feature_data Pointer to the weights (feature) tensor
* @param[in] weights_time_dims Weights (time) tensor dimensions
* @param[in] weights_time_data Pointer to the weights (time) tensor
* @param[in] bias_dims Bias tensor dimensions
* @param[in] bias_data Pointer to bias tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output_data Pointer to the output tensor
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
 * 2. q7 is used as data type even though it is s8 data. This is done to be consistent with existing APIs.
*
*/
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
const cmsis_nn_per_tensor_quant_params *output_quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
q7_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
const q7_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
/**
* @brief s8 SVDF function with 16 bit state tensor and 16 bit time weights
*
* @param[in] input_ctx Temporary scratch buffer
* @param[in] output_ctx Temporary output scratch buffer
* @param[in] svdf_params SVDF Parameters
* Range of svdf_params->input_offset : [-128, 127]
* Range of svdf_params->output_offset : [-128, 127]
* @param[in] input_quant_params Input quantization parameters
* @param[in] output_quant_params Output quantization parameters
* @param[in] input_dims Input tensor dimensions
* @param[in] input_data Pointer to input tensor
* @param[in] state_dims State tensor dimensions
* @param[in] state_data Pointer to state tensor
* @param[in] weights_feature_dims Weights (feature) tensor dimensions
* @param[in] weights_feature_data Pointer to the weights (feature) tensor
* @param[in] weights_time_dims Weights (time) tensor dimensions
* @param[in] weights_time_data Pointer to the weights (time) tensor
* @param[in] bias_dims Bias tensor dimensions
* @param[in] bias_data Pointer to bias tensor
* @param[in] output_dims Output tensor dimensions
* @param[out] output_data Pointer to the output tensor
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite micro
 * 2. q7 is used as data type even though it is s8 data. This is done to be consistent with existing APIs.
*
*/
arm_status arm_svdf_state_s16_s8(const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
const cmsis_nn_per_tensor_quant_params *output_quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *state_dims,
q15_t *state_data,
const cmsis_nn_dims *weights_feature_dims,
const q7_t *weights_feature_data,
const cmsis_nn_dims *weights_time_dims,
const q15_t *weights_time_data,
const cmsis_nn_dims *bias_dims,
const q31_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,1186 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 19. April 2022
* $Revision: V.7.0.1
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
#ifndef _ARM_NNSUPPORTFUNCTIONS_H_
#define _ARM_NNSUPPORTFUNCTIONS_H_
#include "arm_nn_math_types.h"
#include "arm_nn_types.h"
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
#define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0
#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))
#define MAX(A, B) ((A) > (B) ? (A) : (B))
#define MIN(A, B) ((A) < (B) ? (A) : (B))
#define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
#define REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)
/**
* @brief definition to pack four 8 bit values.
*/
#define PACK_Q7x4_32x1(v0, v1, v2, v3) \
((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \
(((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000))
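/*
 * Illustrative sketch (not part of the original header): the macro packs four q7 values into
 * one 32-bit word, least significant byte first.
 *
 *   int32_t packed = PACK_Q7x4_32x1(0x01, 0x02, 0x03, 0x04);   // packed == 0x04030201
 */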
/**
* @brief Union for SIMD access of q31/q15/q7 types
*/
union arm_nnword
{
q31_t word;
/**< q31 type */
q15_t half_words[2];
/**< q15 type */
q7_t bytes[4];
/**< q7 type */
};
/**
 * @brief Struct and union for accessing the two 32-bit halves of a 64-bit (long long) value
*/
struct arm_nn_double
{
uint32_t low;
int32_t high;
};
union arm_nn_long_long
{
int64_t long_long;
struct arm_nn_double word;
};
/**
* @defgroup nndata_convert Neural Network Data Conversion Functions
*
* Perform data type conversion in-between neural network operations
*
*/
/**
* @brief Converts the elements of the q7 vector to q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
*
*/
void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
/**
* @brief Non-saturating addition of elements of a q7 vector
* @param[in] *input Pointer to the q7 input vector
* @param[out] *output Pointer to the q31 output variable.
* @param[in] block_size length of the input vector
* \par Description:
*
* 2^24 samples can be added without saturating the result.
*
 * The equation used for the accumulation is:
*
* <pre>
* sum = input[0] + input[1] + .. + input[block_size -1]
* </pre>
*
* */
void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
/**
* @brief Converts the elements of the q7 vector to reordered q15 vector without left-shift
* @param[in] *pSrc points to the q7 input vector
* @param[out] *pDst points to the q15 output vector
* @param[in] blockSize length of the input vector
* @return none.
*
*/
void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
/**
* @brief Converts the elements from a q7 vector to a q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
* @param[out] dst pointer to the q15 output vector
* @param[in] block_size length of the input vector
* @param[in] offset q7 offset to be added to each input vector element.
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* dst[n] = (q15_t) src[n] + offset; 0 <= n < block_size.
* </pre>
*
*/
void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
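/*
 * Illustrative sketch (not part of the original header): widening an int8 buffer to int16
 * while folding in the input offset, e.g. as a preparation step before a q15 kernel. The
 * buffer names and the offset value are hypothetical.
 *
 *   q7_t  src_q7[8] = {-128, -1, 0, 1, 127, 5, -5, 64};
 *   q15_t dst_q15[8];
 *   arm_q7_to_q15_with_offset(src_q7, dst_q15, 8, 128);
 *   // dst_q15[n] == src_q7[n] + 128, e.g. dst_q15[0] == 0 and dst_q15[4] == 255
 */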
/**
* @brief Converts the elements of the q7 vector to reordered q15 vector with an added offset
* @param[in] src pointer to the q7 input vector
* @param[out] dst pointer to the q15 output vector
* @param[in] block_size length of the input vector
* @param[in] offset offset to be added to each input vector element.
* @return none.
*
* @details This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
 *          the sign extension intrinsic (DSP extension). The tail (i.e., last (N % 4) elements) retains its
* original order.
*
*/
void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
/**
* @brief Converts the elements from a q7 vector and accumulate to a q15 vector
* @param[in] *src points to the q7 input vector
* @param[out] *dst points to the q15 output vector
* @param[in] block_size length of the input vector
*
* \par Description:
*
* The equation used for the conversion process is:
*
* <pre>
* dst[n] += (q15_t) src[n] ; 0 <= n < block_size.
* </pre>
*
*/
void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
/**
* @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
* @param[in] row pointer to row
* @param[in] col pointer to im2col buffer, always consists of 2 columns.
* @param[in] num_ch number of channels
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] activation_min minimum value to clamp the output to. Range : int8
* @param[in] activation_max maximum value to clamp the output to. Range : int8
* @param[in] kernel_size number of elements in one column.
* @param[in] output_bias per output channel bias. Range : int32
* @param[out] out pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details Supported framework: TensorFlow Lite micro.
*/
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
const q15_t *col,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t kernel_size,
const int32_t *const output_bias,
q7_t *out);
/**
* @brief General Matrix-multiplication function with per-channel requantization.
* @param[in] input_row pointer to row operand
* @param[in] input_col pointer to col operand
* @param[in] output_ch number of rows of input_row
* @param[in] col_batches number of column batches. Range: 1 to 4
* @param[in] output_shift pointer to per output channel requantization shift parameter.
* @param[in] output_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] col_offset input tensor(col) offset.
* @param[in] row_offset kernel offset(row). Not used.
* @param[in] out_activation_min minimum value to clamp the output to. Range : int8
* @param[in] out_activation_max maximum value to clamp the output to. Range : int8
* @param[in] row_len number of elements in each row
* @param[in] bias per output channel bias. Range : int32
* @param[in,out] out pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details Supported framework: TensorFlow Lite
*/
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
const q7_t *input_col,
const uint16_t output_ch,
const uint16_t col_batches,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t out_offset,
const int32_t col_offset,
const int32_t row_offset,
const int16_t out_activation_min,
const int16_t out_activation_max,
const uint16_t row_len,
const int32_t *const bias,
q7_t *out);
/**
* @brief Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.
* @param[in] input_a pointer to operand A
* @param[in] input_b pointer to operand B, always consists of 2 vectors.
* @param[in] output_ch number of rows of A
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] activation_min minimum value to clamp the output to. Range : int16
* @param[in] activation_max maximum value to clamp the output to. Range : int16
* @param[in] num_col_a number of columns of A
* @param[in] output_bias per output channel bias. Range : int64
* @param[in,out] out_0 pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details This function does the matrix multiplication of weight matrix for all output channels
* with 2 columns from im2col and produces two elements/output_channel. The outputs are
* clamped in the range provided by activation min and max.
* Supported framework: TensorFlow Lite micro.
*/
q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
const q15_t *input_b,
const int32_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int16_t activation_min,
const int16_t activation_max,
const int32_t num_col_a,
const int64_t *const output_bias,
q15_t *out_0);
/**
* @brief General Matrix-multiplication without requantization for one row & one column
* @param[in] row_elements number of row elements
* @param[in] row_base pointer to row operand
* @param[in] col_base pointer to col operand
* @param[out] sum_col pointer to store sum of column elements
* @param[out] output pointer to store result of multiply-accumulate
* @return The function returns the multiply-accumulated result of the row by column.
*
* @details Pseudo-code
* *output = 0
* sum_col = 0
* for (i = 0; i < row_elements; i++)
* *output += row_base[i] * col_base[i]
* sum_col += col_base[i]
*
*/
arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
const int8_t *row_base,
const int8_t *col_base,
int32_t *const sum_col,
int32_t *const output);
/**
* @brief Matrix-multiplication with requantization & activation function for four rows and one column
* @param[in] row_elements number of row elements
* @param[in] offset offset between rows. Can be the same as row_elements.
 *                              For example, in a 1x1 convolution scenario with a stride of 1.
* @param[in] row_base pointer to row operand
* @param[in] col_base pointer to col operand
* @param[in] out_ch Number of output channels
* @param[in] conv_params Pointer to convolution parameters like offsets and activation values
* @param[in] quant_params Pointer to per-channel quantization parameters
* @param[in] bias Pointer to per-channel bias
* @param[out] output Pointer to output where int8 results are stored.
*
* @return The function returns the updated output pointer or NULL if implementation is not available.
*
 * @details Compliant with the TFLM int8 specification. MVE implementation only
*/
int8_t *arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
const int32_t offset,
const int8_t *row_base,
const int8_t *col_base,
const int32_t out_ch,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const int32_t *bias,
int8_t *output);
/**
* @brief General Matrix-multiplication function with per-channel requantization.
* This function assumes:
* - LHS input matrix NOT transposed (nt)
* - RHS input matrix transposed (t)
*
* @note This operation also performs the broadcast bias addition before the requantization
*
* @param[in] lhs Pointer to the LHS input matrix
* @param[in] rhs Pointer to the RHS input matrix
* @param[in] bias Pointer to the bias vector. The length of this vector is equal to the number of
* output columns (or RHS input rows)
* @param[out] dst Pointer to the output matrix with "m" rows and "n" columns
* @param[in] dst_multipliers Pointer to the multipliers vector needed for the per-channel requantization.
* The length of this vector is equal to the number of output columns (or RHS input
* rows)
* @param[in] dst_shifts Pointer to the shifts vector needed for the per-channel requantization. The length
* of this vector is equal to the number of output columns (or RHS input rows)
* @param[in] lhs_rows Number of LHS input rows
* @param[in] rhs_rows Number of RHS input rows
* @param[in] rhs_cols Number of LHS/RHS input columns
* @param[in] lhs_offset Offset to be applied to the LHS input value
* @param[in] dst_offset Offset to be applied the output result
* @param[in] activation_min Minimum value to clamp down the output. Range : int8
* @param[in] activation_max Maximum value to clamp up the output. Range : int8
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t *dst_multipliers,
const int32_t *dst_shifts,
const int32_t lhs_rows,
const int32_t rhs_rows,
const int32_t rhs_cols,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief s8 Vector by Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector.
* Range: -127 to 128
* @param[in] rhs_offset Not used
* @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
* second at 'dst + address_offset' and so on. Default value is typically 1.
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
const q7_t *rhs,
const q31_t *bias,
q7_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t dst_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max,
const int32_t address_offset);
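// Illustrative usage sketch (added; not part of the original API documentation):
// a fully connected layer with IN input features and OUT output units maps onto
// this kernel roughly as below. IN, OUT and the quantization parameters are
// hypothetical placeholders.
//
//   const q7_t  input[IN];          // activation vector (lhs)
//   const q7_t  weights[OUT * IN];  // weights stored transposed (rhs)
//   const q31_t bias[OUT];
//   q7_t        out[OUT];
//
//   arm_nn_vec_mat_mult_t_s8(input, weights, bias, out,
//                            input_offset, 0, output_offset,
//                            out_multiplier, out_shift,
//                            IN, OUT, -128, 127, 1);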
/**
* @brief s16 Vector by Matrix (transposed) multiplication
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int16
* @param[in] activation_max Maximum value to clamp the output to. Range: int16
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_vec_mat_mult_t_s16(const q15_t *lhs,
const q7_t *rhs,
const q63_t *bias,
q15_t *dst,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief s8 Vector by Matrix (transposed) multiplication with s16 output
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[out] dst Output vector
* @param[in] lhs_offset Offset to be added to the input values of the left-hand side
* vector. Range: -127 to 128
* @param[in] rhs_offset Not used
* @param[in] scatter_offset Address offset for dst. First output is stored at 'dst', the
* second at 'dst + scatter_offset' and so on.
* @param[in] dst_multiplier Output multiplier
* @param[in] dst_shift Output shift
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int16
* @param[in] activation_max Maximum value to clamp the output to. Range: int16
*
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
*/
arm_status arm_nn_vec_mat_mult_t_svdf_s8(const q7_t *lhs,
const q7_t *rhs,
q15_t *dst,
const int32_t lhs_offset,
const int32_t rhs_offset,
const int32_t scatter_offset,
const int32_t dst_multiplier,
const int32_t dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where
 *        the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs.
*
* @param[in] lhs Input left-hand side matrix
* @param[in] rhs Input right-hand side matrix (transposed)
 * @param[in]      lhs_offset      LHS matrix offset (input offset). Range: -127 to 128
* @param[in] num_ch Number of channels in LHS/RHS
* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels
* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels
* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels
* @param[in] out Output pointer
*
* @return The function returns one of the two
* - Updated output pointer if an implementation is available
* - NULL if no implementation is available.
*
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* out for the following.
* - Output shift
* - Output multiplier
* - Output bias
* - rhs
*/
q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t lhs_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out);
/**
* @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
* Dimensions are the same for lhs and rhs.
*
* @param[in] lhs Input left-hand side matrix
* @param[in] rhs Input right-hand side matrix (transposed)
 * @param[in]      lhs_offset      LHS matrix offset (input offset). Range: -127 to 128
* @param[in] num_ch Number of channels in LHS/RHS
* @param[in] out_shift Per channel output shift. Length of vector is equal to number of channels.
* @param[in] out_mult Per channel output multiplier. Length of vector is equal to number of channels.
* @param[in] out_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] row_x_col (row_dimension * col_dimension) of LHS/RHS matrix
* @param[in] output_bias Per channel output bias. Length of vector is equal to number of channels.
* @param[in] out Output pointer
*
* @return The function returns one of the two
* - Updated output pointer if an implementation is available
* - NULL if no implementation is available.
*
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* out for the following.
* - Output shift
* - Output multiplier
* - Output bias
* - rhs
*/
q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
const q7_t *rhs,
const int32_t lhs_offset,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t row_x_col,
const int32_t *const output_bias,
q7_t *out);
/**
*@brief Matrix-multiplication function for convolution with reordered columns
*@param[in] pA pointer to operand A
 *@param[in]       pInBuffer    pointer to operand B, always consists of 2 vectors
*@param[in] ch_im_out numRow of A
*@param[in] numCol_A numCol of A
*@param[in] bias_shift amount of left-shift for bias
*@param[in] out_shift amount of right-shift for output
*@param[in] bias the bias
*@param[in,out] pOut pointer to output
*@return The function returns the incremented output pointer
*
*@details This function assumes that data in pInBuffer are reordered
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
const q15_t *pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut);
/**
@brief Read 2 q15 elements and post increment pointer.
@param[in] in_q15 Pointer to pointer that holds address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
{
q31_t val;
memcpy(&val, *in_q15, 4);
*in_q15 += 2;
return (val);
}
/**
@brief Read 4 q7 from q7 pointer and post increment pointer.
@param[in] in_q7 Pointer to pointer that holds address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
{
q31_t val;
memcpy(&val, *in_q7, 4);
*in_q7 += 4;
return (val);
}
/**
@brief Read 2 q15 from q15 pointer.
@param[in] in_q15 pointer to address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
{
q31_t val;
memcpy(&val, in_q15, 4);
return (val);
}
/**
@brief Read 4 q7 values.
@param[in] in_q7 pointer to address of input.
@return q31 value
*/
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
{
q31_t val;
memcpy(&val, in_q7, 4);
return (val);
}
/**
@brief Write four q7 to q7 pointer and increment pointer afterwards.
@param[in] in Double pointer to input value
@param[in] value Four bytes to copy
*/
__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia(q7_t **in, q31_t value)
{
memcpy(*in, &value, 4);
*in += 4;
}
/**
* @brief memset optimized for MVE
* @param[in, out] dst Destination pointer
* @param[in] val Value to set
* @param[in] block_size Number of bytes to copy.
*
*/
__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
__asm volatile(" vdup.8 q0, %[set_val] \n"
" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vstrb.8 q0, [%[in]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [ in ] "+r"(dst)
: [ cnt ] "r"(block_size), [ set_val ] "r"(val)
: "q0", "memory", "r14");
#else
memset(dst, val, block_size);
#endif
}
#if defined(ARM_MATH_DSP)
/**
* @brief read and expand one q7 word into two q15 words
*/
__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
q31_t inAbuf1 = __SXTB16_RORn((uint32_t)inA, 8);
q31_t inAbuf2 = __SXTB16(inA);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
*out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
#else
*out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
*out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
#endif
return source;
}
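// Worked example (added, little-endian case): for source bytes {1, -2, 3, -4} at
// increasing addresses, read_and_pad() advances the pointer by 4 and sets *out1
// to the halfword pair {1, -2} (1 in the low half) and *out2 to {3, -4}, i.e. the
// four q7 values sign-extended to q15 in their original order.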
/**
* @brief read and expand one q7 word into two q15 words with reordering
*/
__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
*out1 = __SXTB16(inA);
#else
*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
*out2 = __SXTB16(inA);
#endif
return source;
}
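// Note (added, little-endian case): unlike read_and_pad(), the reordered variant
// keeps the raw __SXTB16 pairing, so for input bytes {b0, b1, b2, b3} it returns
// *out1 = {b0, b2} and *out2 = {b1, b3}. This pairing is typically sufficient for
// dot-product style accumulation, where the order of the pairs does not matter.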
/**
* @brief read and expand one q7 word into two q15 words with reordering and add an offset
*/
__STATIC_FORCEINLINE const q7_t *
read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset)
{
q31_t inA = arm_nn_read_q7x4_ia(&source);
#ifndef ARM_MATH_BIG_ENDIAN
*out2 = __SXTB16(__ROR((uint32_t)inA, 8));
*out1 = __SXTB16(inA);
#else
*out1 = __SXTB16(__ROR((uint32_t)inA, 8));
*out2 = __SXTB16(inA);
#endif
*out1 = __QADD16(*out1, offset);
*out2 = __QADD16(*out2, offset);
return source;
}
#endif
/**
* @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
*
* Basic Math Functions for Neural Network Computation
*
*/
/**
 * @brief q15 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
*/
void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);
/**
* @brief q7 vector multiplication with variable output shifts
* @param[in] *pSrcA pointer to the first input vector
* @param[in] *pSrcB pointer to the second input vector
* @param[out] *pDst pointer to the output vector
* @param[in] out_shift amount of right-shift for output
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
*/
void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
/**
* @brief Matrix-multiplication function for convolution with per-channel requantization.
* @param[in] input_a pointer to operand A
* @param[in] input_b pointer to operand B, always consists of 2 vectors.
* @param[in] output_ch number of rows of A
* @param[in] out_shift pointer to per output channel requantization shift parameter.
* @param[in] out_mult pointer to per output channel requantization multiplier parameter.
* @param[in] out_offset output tensor offset.
* @param[in] activation_min minimum value to clamp the output to. Range : int8
* @param[in] activation_max maximum value to clamp the output to. Range : int8
* @param[in] num_col_a number of columns of A
* @param[in] output_bias per output channel bias. Range : int32
* @param[in,out] out_0 pointer to output
* @return The function returns one of the two
* 1. The incremented output pointer for a successful operation or
* 2. NULL if implementation is not available.
*
* @details This function does the matrix multiplication of weight matrix for all output channels
* with 2 columns from im2col and produces two elements/output_channel. The outputs are
* clamped in the range provided by activation min and max.
* Supported framework: TensorFlow Lite micro.
*/
q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
const q15_t *input_b,
const uint16_t output_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int16_t activation_min,
const int16_t activation_max,
const uint16_t num_col_a,
const int32_t *const output_bias,
q7_t *out_0);
/**
* @brief Common softmax function for s8 input and s8 or s16 output
* @param[in] input Pointer to the input tensor
* @param[in] num_rows Number of rows in the input tensor
* @param[in] row_size Number of elements in each input row
* @param[in] mult Input quantization multiplier
* @param[in] shift Input quantization shift within the range [0, 31]
* @param[in] diff_min Minimum difference with max in row. Used to check if
* the quantized exponential operation can be performed
 * @param[in]  int16_output  Non-zero value indicates s16 output, 0 indicates s8 output
* @param[out] output Pointer to the output tensor
*
* @note Supported framework: TensorFlow Lite micro (bit-accurate)
*
*/
void arm_nn_softmax_common_s8(const int8_t *input,
const int32_t num_rows,
const int32_t row_size,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
const bool int16_output,
void *output);
/**
* @brief macro for adding rounding offset
*/
#ifndef ARM_NN_TRUNCATE
#define NN_ROUND(out_shift) ((0x1 << out_shift) >> 1)
#else
#define NN_ROUND(out_shift) 0
#endif
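// Worked example (added for clarity): with ARM_NN_TRUNCATE undefined and
// out_shift = 3, NN_ROUND(3) = (1 << 3) >> 1 = 4, so (sum + NN_ROUND(3)) >> 3
// rounds to the nearest value instead of truncating, e.g. (19 + 4) >> 3 = 2,
// which matches round(19 / 8).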
// Macros for shortening quantization function names and avoiding long lines
#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b))
#define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b))
#define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b))
#define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b))
#define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b))
#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x))
#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
/**
* @brief Saturating doubling high multiply. Result matches
* NEON instruction VQRDMULH.
* @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
* @param[in] m2 Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX}
* @return Result of multiplication.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t m2)
{
q31_t result = 0;
// Rounding offset to add for a right shift of 31
q63_t mult = 1 << 30;
if ((m1 < 0) ^ (m2 < 0))
{
mult = 1 - mult;
}
// Gets resolved as a SMLAL instruction
mult = mult + (q63_t)m1 * m2;
// Utilize all of the upper 32 bits. This is the doubling step
// as well.
result = (int32_t)(mult / (1ll << 31));
if ((m1 == m2) && (m1 == (int32_t)NN_Q31_MIN))
{
result = NN_Q31_MAX;
}
return result;
}
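// Worked example (added for clarity): the result is the rounded Q31 product of
// the two operands, approximately (m1 * m2) / 2^31. For m1 = m2 = 0x40000000
// (0.5 in Q31) the function returns 0x20000000 (0.25 in Q31).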
/**
* @brief Doubling high multiply without saturation. This is intended
* for requantization where the scale is a positive integer
*
* @param[in] m1 Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX}
* @param[in] m2 Multiplier Range: {NN_Q31_MIN, NN_Q31_MAX}
* @return Result of multiplication.
* @note The result of this matches that of neon instruction
* VQRDMULH for m1 in range {NN_Q31_MIN, NN_Q31_MAX} and m2 in
* range {NN_Q31_MIN + 1, NN_Q31_MAX}. Saturation occurs when
* m1 equals m2 equals NN_Q31_MIN and that is not handled by
* this function.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat(const q31_t m1, const q31_t m2)
{
q31_t result = 0;
union arm_nn_long_long mult;
// Rounding offset to add for a right shift of 31
mult.word.low = 1 << 30;
mult.word.high = 0;
// Gets resolved as a SMLAL instruction
mult.long_long = mult.long_long + (q63_t)m1 * m2;
// Utilize all of the upper 32 bits. This is the doubling step
// as well.
result = (int32_t)(mult.long_long >> 31);
return result;
}
/**
* @brief Rounding divide by power of two.
* @param[in] dividend - Dividend
* @param[in] exponent - Divisor = power(2, exponent)
* Range: [0, 31]
* @return Rounded result of division. Midpoint is rounded away from zero.
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, const q31_t exponent)
{
q31_t result = 0;
const q31_t remainder_mask = (1 << exponent) - 1;
int32_t remainder = remainder_mask & dividend;
// Basic division
result = dividend >> exponent;
// Adjust 'result' for rounding (mid point away from zero)
q31_t threshold = remainder_mask >> 1;
if (result < 0)
{
threshold++;
}
if (remainder > threshold)
{
result++;
}
return result;
}
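// Worked examples (added for clarity): arm_nn_divide_by_power_of_two(5, 1) = 3
// and arm_nn_divide_by_power_of_two(-5, 1) = -3 (2.5 and -2.5 rounded away from
// zero), while arm_nn_divide_by_power_of_two(6, 2) = 2 (1.5 rounded up).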
/**
* @brief Requantize a given value.
* @param[in] val Value to be requantized
* @param[in] multiplier multiplier. Range {NN_Q31_MIN + 1, Q32_MAX}
* @param[in] shift left or right shift for 'val * multiplier'
*
* @return Returns (val * multiplier)/(2 ^ shift)
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
{
#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
const int64_t total_shift = 31 - shift;
const int64_t new_val = val * (int64_t)multiplier;
int32_t result = new_val >> (total_shift - 1);
result = (result + 1) >> 1;
return result;
#else
return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
RIGHT_SHIFT(shift));
#endif
}
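// Worked example (added for clarity): the multiplier acts as a Q31 fraction and
// the shift as an extra power of two (left shift if positive, right shift if
// negative). For val = 200, multiplier = 0x40000000 (0.5 in Q31) and shift = -1,
// both the single-rounding and the default path return 200 * 0.5 / 2 = 50.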
/**
* @brief Requantize a given 64 bit value.
* @param[in] val Value to be requantized in the range {-(1<<47)} to {(1<<47) - 1}
* @param[in] reduced_multiplier Reduced multiplier in the range {NN_Q31_MIN + 1, Q32_MAX} to {Q16_MIN + 1,
* Q16_MAX}
* @param[in] shift Left or right shift for 'val * multiplier' in the range {-31} to {7}
*
* @return Returns (val * multiplier)/(2 ^ shift)
*
*/
__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64(const q63_t val, const q31_t reduced_multiplier, const q31_t shift)
{
const q63_t new_val = val * reduced_multiplier;
q31_t result = new_val >> (14 - shift); // 64->32 bit reduction
result = (result + 1) >> 1; // Last shift position and insert round
return result;
}
/**
* @brief memcpy optimized for MVE
* @param[in, out] dst Destination pointer
* @param[in] src Source pointer.
* @param[in] block_size Number of bytes to copy.
*
*/
__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
__asm volatile(" wlstp.8 lr, %[cnt], 1f \n"
"2: \n"
" vldrb.8 q0, [%[in]], #16 \n"
" vstrb.8 q0, [%[out]], #16 \n"
" letp lr, 2b \n"
"1: \n"
: [ in ] "+r"(src), [ out ] "+r"(dst)
: [ cnt ] "r"(block_size)
: "q0", "memory", "r14");
#else
memcpy(dst, src, block_size);
#endif
}
#if defined(ARM_MATH_MVEI)
/**
* @brief Vector saturating doubling high multiply returning high half.
* @param[in] m1 Multiplicand
* @param[in] m2 Multiplier
* @return Result of multiplication.
*
*/
__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, const q31_t m2)
{
return vqrdmulhq_n_s32(m1, m2);
}
/**
* @brief Vector rounding divide by power of two.
* @param[in] dividend - Dividend vector
* @param[in] exponent - Divisor = power(2, exponent)
* Range: [0, 31]
* @return Rounded result of division. Midpoint is rounded away from zero.
*
*/
__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
{
const int32x4_t shift = vdupq_n_s32(-exponent);
const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
return vrshlq_s32(fixed_up_dividend, shift);
}
/**
* @brief Requantize a given vector.
* @param[in] val Vector to be requantized
* @param[in] multiplier multiplier
* @param[in] shift shift
*
* @return Returns (val * multiplier)/(2 ^ shift)
*
*/
__STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift)
{
#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
const int right_shift = MIN(-1, shift);
const int left_shift = shift - right_shift;
const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
int32x4_t result = vqdmulhq_n_s32(vshlq_s32(val, left_shift_dup), multiplier);
result = vrshlq_s32(result, right_shift_dup);
return result;
#else
return arm_divide_by_power_of_two_mve(
arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift));
#endif
}
__STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2)
{
return vqrdmulhq_s32(m1, m2);
}
__STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent)
{
const int32x4_t shift = -exponent;
const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
return vrshlq_s32(fixed_up_dividend, shift);
}
__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val,
const int32x4_t multiplier,
const int32x4_t shift)
{
#ifdef CMSIS_NN_USE_SINGLE_ROUNDING
const int32x4_t right_shift = vminq_s32(vdupq_n_s32(-1), shift);
const int32x4_t left_shift = vqsubq_s32(shift, right_shift);
int32x4_t result = vqdmulhq_s32(vshlq_s32(val, left_shift), multiplier);
result = vrshlq_s32(result, right_shift);
return result;
#else
const int32x4_t zz = vdupq_n_s32(0);
const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
const int32x4_t left_shift = vpselq_s32(shift, zz, p);
const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier),
right_shift);
#endif
}
#endif
// @note The following functions are used only for softmax layer, scaled bits = 5 assumed
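// Background (added, inferred from the constants below): arm_nn_exp_on_negative_values()
// appears to follow the gemmlowp-style fixed-point exp for non-positive inputs. With 5
// scaled bits the input is treated as Q5.26, the fraction below 1/4 is handled by a small
// polynomial, and each remaining input bit is folded in by multiplying with a precomputed
// Q0.31 constant: 1672461947 ~ exp(-1/4), 1302514674 ~ exp(-1/2), 790015084 ~ exp(-1),
// 290630308 ~ exp(-2), 39332535 ~ exp(-4), 720401 ~ exp(-8) and 242 ~ exp(-16), each
// scaled by 2^31.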
__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val)
{
int32_t mask = 0;
int32_t shift = 24;
const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift);
const int32_t remainder = val_mod_minus_quarter - val;
const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
const int32_t x2 = MUL_SAT(x, x);
int32_t result = 1895147668 +
MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
#define SELECT_IF_NON_ZERO(x) \
{ \
mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
}
SELECT_IF_NON_ZERO(1672461947)
SELECT_IF_NON_ZERO(1302514674)
SELECT_IF_NON_ZERO(790015084)
SELECT_IF_NON_ZERO(290630308)
SELECT_IF_NON_ZERO(39332535)
SELECT_IF_NON_ZERO(720401)
SELECT_IF_NON_ZERO(242)
#undef SELECT_IF_NON_ZERO
mask = MASK_IF_ZERO(val);
return SELECT_USING_MASK(mask, NN_Q31_MAX, result);
}
__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two(const int32_t val, const int32_t exp)
{
const int32_t thresh = ((1 << (31 - exp)) - 1);
int32_t result = val << exp;
result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), NN_Q31_MAX, result);
result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), NN_Q31_MIN, result);
return result;
}
__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
{
const int64_t sum = (int64_t)val + (int64_t)NN_Q31_MAX;
const int32_t half_denominator = (int32_t)((sum + (sum >= 0 ? 1 : -1)) / 2L);
int32_t x = 1515870810 + MUL_SAT(half_denominator, -1010580540);
const int32_t shift = (1 << 29);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
x += MUL_POW2(MUL_SAT(x, shift - MUL_SAT(half_denominator, x)), 2);
return MUL_POW2(x, 1);
}
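// Note (added, inferred from the constants above): arm_nn_one_over_one_plus_x_for_x_in_0_1()
// appears to implement the usual Newton-Raphson reciprocal in fixed point: with
// d = (1 + val) / 2 as half_denominator, it starts from x0 = 48/17 - (32/17) * d
// (constants 1515870810 and -1010580540 in Q2.29) and applies x += x * (1 - d * x)
// three times before scaling the result back to Q0.31.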
/**
@brief Write 2 q15 elements and post increment pointer.
@param[in] dest_q15 Pointer to pointer that holds address of destination.
@param[in] src_q31 Input value to be written.
*/
__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
{
q31_t val = src_q31;
memcpy(*dest_q15, &val, 4);
*dest_q15 += 2;
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,30 @@
#
# Copyright (c) 2019-2021 Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
project(CMSISNNActivation)
file(GLOB SRC "./*_s8.c")
add_library(CMSISNNActivation STATIC ${SRC})
### Includes
target_include_directories(CMSISNNActivation PUBLIC "${NN}/Include")
target_include_directories(CMSISNNActivation PUBLIC "${ROOT}/CMSIS/Core/Include")
target_include_directories(CMSISNNActivation PUBLIC "${ROOT}/CMSIS/DSP/Include")

View File

@@ -0,0 +1,96 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q15.c
* Description: Q15 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief neural network activation function using direct table look-up
*
* @note Refer header file for details.
*
*/
void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q15_t *pIn = data;
q15_t *pOut = data;
uint16_t shift_size = 8 + 3 - int_width;
uint32_t bit_mask = 0x7FF >> int_width;
uint32_t full_frac = bit_mask + 1;
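// Added note: with int_width = 0, shift_size is 11, so the upper bits of each q15
// input select an entry of the 256-entry lookup table (the uint8_t cast wraps
// negative indices into the upper part of the table), while the low 11 bits
// (bit_mask = 0x7FF) form the interpolation fraction and full_frac = 0x800
// represents 1.0 for that fraction.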
const q15_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q15;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q15;
break;
}
while (i)
{
q15_t out;
q15_t in = *pIn++;
q15_t frac = (uint32_t)in & bit_mask;
q15_t value = lookup_table[(uint8_t)(in >> shift_size)];
if ((in >> shift_size) != 0x7f)
{
q15_t value2 = lookup_table[(uint8_t)(1 + ((uint8_t)(in >> shift_size)))];
/* doing the interpolation here for better accuracy */
out = ((q31_t)(full_frac - frac) * value + (q31_t)value2 * frac) >> shift_size;
}
else
{
/* the largest positive value does not have a right side for linear interpolation */
out = value;
}
*pOut++ = out;
i--;
}
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,89 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_activations_q7.c
* Description: Q7 neural network activation function using direct table look-up
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nn_tables.h"
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 neural network activation function using direct table look-up
* @param[in,out] data pointer to input
* @param[in] size number of elements
 * @param[in]     int_width   bit-width of the integer part, assumed to be no larger than 3
* @param[in] type type of activation functions
*
* @details
*
* This is the direct table look-up approach.
*
 * The integer part of the fixed-point value is assumed to be <= 3.
 * Using more than 3 integer bits makes little sense: after saturation, any of
 * these activation functions produces essentially the same result.
*/
void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type)
{
uint16_t i = size;
q7_t *pIn = data;
q7_t *pOut = data;
q7_t in;
q7_t out;
uint16_t shift_size = 3 - int_width;
const q7_t *lookup_table;
switch (type)
{
case ARM_SIGMOID:
lookup_table = sigmoidTable_q7;
break;
case ARM_TANH:
default:
lookup_table = tanhTable_q7;
break;
}
while (i)
{
in = *pIn++;
out = lookup_table[(uint8_t)(in >> shift_size)];
*pOut++ = out;
i--;
}
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,65 @@
/*
* Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu6_s8.c
* Description: Basic s8 version of ReLU6
*
* $Date: 09. October 2020
* $Revision: V.1.0.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/*
* Basic ReLU6 function
*
* Refer to header file for details.
*
*/
void arm_relu6_s8(q7_t *data, uint16_t size)
{
int32_t i;
for (i = 0; i < size; i++)
{
int32_t ip = data[i];
ip = MAX(ip, 0);
data[i] = MIN(ip, 6);
}
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q15.c
* Description: Q15 version of ReLU
*
* $Date: 09. October 2020
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q15 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized relu with QSUB instructions.
*
*/
void arm_relu_q15(q15_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 1;
q15_t *input = data;
q15_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = read_q15x2_ia(&input);
/* extract the first bit */
buf = __ROR(in & 0x80008000, 15);
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB16(0x00000000, buf);
arm_nn_write_q15x2_ia(&output, in & (~mask));
i--;
}
if (size & 0x1)
{
if (*input < 0)
{
*input = 0;
}
input++;
}
#else
/* Run the following code as reference implementation for M cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif /* ARM_MATH_DSP */
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,109 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_relu_q7.c
* Description: Q7 version of ReLU
*
* $Date: 20. July 2021
* $Revision: V.1.1.3
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Acti
* @{
*/
/**
* @brief Q7 RELU function
* @param[in,out] data pointer to input
* @param[in] size number of elements
*
* @details
*
* Optimized relu with QSUB instructions.
*
*/
void arm_relu_q7(q7_t *data, uint16_t size)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for M cores with DSP extension */
uint16_t i = size >> 2;
q7_t *input = data;
q7_t *output = data;
q31_t in;
q31_t buf;
q31_t mask;
while (i)
{
in = arm_nn_read_q7x4_ia((const q7_t **)&input);
/* extract the first bit */
buf = (int32_t)__ROR((uint32_t)in & 0x80808080, 7);
/* if MSB=1, mask will be 0xFF, 0x0 otherwise */
mask = __QSUB8(0x00000000, buf);
arm_nn_write_q7x4_ia(&output, in & (~mask));
i--;
}
i = size & 0x3;
while (i)
{
if (*input < 0)
{
*input = 0;
}
input++;
i--;
}
#else
/* Run the following code as reference implementation for cores without DSP extension */
uint16_t i;
for (i = 0; i < size; i++)
{
if (data[i] < 0)
data[i] = 0;
}
#endif
}
/**
* @} end of Acti group
*/

View File

@@ -0,0 +1,31 @@
#
# Copyright (c) 2019-2021 Arm Limited. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
project(CMSISNNBasicMaths)
file(GLOB SRC "./*_*.c")
add_library(CMSISNNBasicMaths STATIC ${SRC})
### Includes
target_include_directories(CMSISNNBasicMaths PUBLIC "${NN}/Include")
target_include_directories(CMSISNNBasicMaths PUBLIC "${ROOT}/CMSIS/Core/Include")
target_include_directories(CMSISNNBasicMaths PUBLIC "${ROOT}/CMSIS/DSP/Include")

View File

@@ -0,0 +1,105 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s16
* Description: Elementwise add
*
 * $Date:        14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
* s16 elementwise add
*
* Refer header file for details.
*
*/
/* Note: __SHIFT is expected to be <=0 */
arm_status arm_elementwise_add_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
(void)input_1_offset;
(void)input_2_offset;
(void)out_offset;
int32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t sum;
loop_count = block_size;
while (loop_count > 0)
{
/* C = A + B */
input_1 = *input_1_vect++ << left_shift;
input_2 = *input_2_vect++ << left_shift;
input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
sum = arm_nn_requantize(sum, out_mult, out_shift);
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*output++ = (int16_t)sum;
/* Decrement loop counter */
loop_count--;
}
return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/

View File

@@ -0,0 +1,255 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_add_s8
* Description: Element wise add
*
* $Date: 01. March 2021
* $Revision: V.2.5.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#if defined(ARM_MATH_MVEI)
#include "arm_helium_utils.h"
#endif
#if defined(ARM_MATH_MVEI)
#define SAT_INPUT_VECT(__INPUT_V, __MULT, __SHIFT) \
__INPUT_V = arm_doubling_high_mult_mve(__INPUT_V, __MULT); \
__INPUT_V = arm_divide_by_power_of_two_mve(__INPUT_V, -__SHIFT);
#endif
/**
 * @note The *_no_sat API does not mean that the input is not saturated. Since
* __MULT is a positive integer, it is saturated. The API definition
* has more info about it.
*/
#define SAT_INPUT(__INPUT, __MULT, __SHIFT) \
__INPUT = arm_nn_doubling_high_mult_no_sat(__INPUT, __MULT); \
__INPUT = arm_nn_divide_by_power_of_two(__INPUT, -__SHIFT);
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/*
* s8 element wise add
*
* Refer header file for details.
*
*/
/* Note: __SHIFT is expected to be <=0 */
arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_1_mult,
const int32_t input_1_shift,
const int32_t input_2_offset,
const int32_t input_2_mult,
const int32_t input_2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const uint32_t block_size)
{
#if defined(ARM_MATH_MVEI)
int32_t count = (int32_t)block_size;
while (count > 0)
{
int32x4_t vect_1;
int32x4_t vect_2;
mve_pred16_t p = vctp32q((uint32_t)count);
vect_1 = vldrbq_z_s32(input_1_vect, p);
vect_2 = vldrbq_z_s32(input_2_vect, p);
vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
vect_1 = vshlq_r_s32(vect_1, left_shift);
vect_2 = vshlq_r_s32(vect_2, left_shift);
SAT_INPUT_VECT(vect_1, input_1_mult, input_1_shift);
SAT_INPUT_VECT(vect_2, input_2_mult, input_2_shift);
vect_1 = vaddq_s32(vect_1, vect_2);
SAT_INPUT_VECT(vect_1, out_mult, out_shift);
vect_1 = vaddq_n_s32(vect_1, out_offset);
vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
input_1_vect += 4;
input_2_vect += 4;
vstrbq_p_s32(output, vect_1, p);
output += 4;
count -= 4;
}
#else
uint32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t sum;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0U)
{
/* 4 outputs are calculated in one loop. The order of calculation follows the output order of the
sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Sum 1 */
input_1 = (b_1 & 0x0FFFF) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (b_2 & 0x0FFFF) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r1 = (q7_t)sum;
/* Sum 3 */
input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r3 = (q7_t)sum;
/* Sum 2 */
input_1 = (a_1 & 0x0FFFF) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = (a_2 & 0x0FFFF) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r2 = (q7_t)sum;
/* Sum 4 */
input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
SAT_INPUT(input_1, input_1_mult, input_1_shift);
input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
SAT_INPUT(input_2, input_2_mult, input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
r4 = (q7_t)sum;
write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0U)
{
/* C = A + B */
input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
input_1 = arm_nn_doubling_high_mult(input_1, input_1_mult);
input_1 = arm_nn_divide_by_power_of_two(input_1, -input_1_shift);
input_2 = arm_nn_doubling_high_mult(input_2, input_2_mult);
input_2 = arm_nn_divide_by_power_of_two(input_2, -input_2_shift);
sum = input_1 + input_2;
SAT_INPUT(sum, out_mult, out_shift);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*output++ = (q7_t)sum;
/* Decrement loop counter */
loop_count--;
}
#endif /* ARM_MATH_MVEI */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of BasicMath group
*/

View File

@@ -0,0 +1,95 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s16
* Description: Element wise multiplication
*
 * $Date:        14 February 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
* @brief s16 element wise multiplication of two vectors
*
* @note Refer header file for details.
*
*/
arm_status arm_elementwise_mul_s16(const int16_t *input_1_vect,
const int16_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int16_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const int32_t block_size)
{
(void)input_1_offset;
(void)input_2_offset;
(void)out_offset;
int32_t loop_count;
int32_t input_1;
int32_t input_2;
int32_t mul_res;
loop_count = block_size;
while (loop_count > 0)
{
/* C = A * B */
input_1 = *input_1_vect++;
input_2 = *input_2_vect++;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift);
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (int16_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/

View File

@@ -0,0 +1,200 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_elementwise_mul_s8
* Description: Element wise multiplication
*
* $Date: January 26, 2021
* $Revision: V.1.0.5
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup BasicMath
* @{
*/
/**
* @brief s8 element wise multiplication of two vectors
*
* @note Refer header file for details.
*
*/
arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
const int8_t *input_2_vect,
const int32_t input_1_offset,
const int32_t input_2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t out_activation_min,
const int32_t out_activation_max,
const uint32_t block_size)
{
int32_t loop_count;
#if defined(ARM_MATH_MVEI)
loop_count = (block_size + 3) / 4;
uint32_t num_elements = block_size;
for (int i = 0; i < loop_count; i++)
{
mve_pred16_t p = vctp32q(num_elements);
int32x4_t input_1 = vldrbq_z_s32(input_1_vect, p);
input_1 = vaddq_n_s32(input_1, input_1_offset);
int32x4_t input_2 = vldrbq_z_s32(input_2_vect, p);
input_2 = vaddq_n_s32(input_2, input_2_offset);
int32x4_t res_0 = vmulq_s32(input_1, input_2);
res_0 = arm_requantize_mve_32x4(res_0, vdupq_n_s32(out_mult), vdupq_n_s32(out_shift));
res_0 += vdupq_n_s32(out_offset);
res_0 = vmaxq_s32(res_0, vdupq_n_s32(out_activation_min));
res_0 = vminq_s32(res_0, vdupq_n_s32(out_activation_max));
vstrbq_p_s32(output, res_0, p);
input_1_vect += 4;
input_2_vect += 4;
output += 4;
num_elements -= 4;
}
#else
int32_t input_1;
int32_t input_2;
int32_t mul_res;
#if defined(ARM_MATH_DSP)
int32_t a_1, b_1, a_2, b_2;
int32_t offset_1_packed, offset_2_packed;
int8_t r1, r2, r3, r4;
offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
loop_count = block_size >> 2;
while (loop_count > 0)
{
/* 4 outputs are calculated in one loop. The order of calculation follows the output order of the
sign-extension intrinsic */
input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
a_1 = __SADD16(a_1, offset_1_packed);
b_1 = __SADD16(b_1, offset_1_packed);
a_2 = __SADD16(a_2, offset_2_packed);
b_2 = __SADD16(b_2, offset_2_packed);
/* Mul 1 */
input_1 = (int16_t)(b_1 & 0x0FFFFL);
input_2 = (int16_t)(b_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r1 = (q7_t)mul_res;
/* Mul 3 */
input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r3 = (q7_t)mul_res;
/* Mul 2 */
input_1 = (int16_t)(a_1 & 0x0FFFFL);
input_2 = (int16_t)(a_2 & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r2 = (q7_t)mul_res;
/* Mul 4 */
input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
r4 = (q7_t)mul_res;
write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
loop_count--;
}
loop_count = block_size & 0x3;
#else
loop_count = block_size;
#endif
while (loop_count > 0)
{
/* C = A * B */
input_1 = *input_1_vect++ + input_1_offset;
input_2 = *input_2_vect++ + input_2_offset;
mul_res = input_1 * input_2;
mul_res = arm_nn_requantize(mul_res, out_mult, out_shift) + out_offset;
mul_res = MAX(mul_res, out_activation_min);
mul_res = MIN(mul_res, out_activation_max);
*output++ = (q7_t)mul_res;
/* Decrement loop counter */
loop_count--;
}
#endif
return ARM_MATH_SUCCESS;
}
/**
* @} end of BasicMath group
*/
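For orientation, a minimal caller sketch for arm_elementwise_mul_s8 (not part of this commit; the offsets, multiplier, shift and activation limits below are illustrative quantization parameters, not values taken from the library):

#include "arm_nnfunctions.h"

/* Element-wise multiply of two s8-quantized vectors of 8 elements.
 * All quantization parameters below are made-up example values. */
static int8_t in_a[8] = {10, 20, -30, 40, 5, -6, 7, 8};
static int8_t in_b[8] = {1, 2, 3, 4, -5, 6, -7, 8};
static int8_t out_vec[8];

void example_elementwise_mul(void)
{
    arm_status status = arm_elementwise_mul_s8(in_a, in_b,
                                               128,          /* input_1_offset (example) */
                                               128,          /* input_2_offset (example) */
                                               out_vec,
                                               -128,         /* out_offset (example) */
                                               1073741824,   /* out_mult, ~0.5 in Q31 (example) */
                                               -3,           /* out_shift (example) */
                                               -128, 127,    /* activation range */
                                               8);           /* block_size */
    (void)status;                                            /* ARM_MATH_SUCCESS expected */
}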

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
SET(ROOT ${CMSIS_PATH})
# Select which parts of CMSIS-NN must be compiled.
# There are some dependencies between the parts, but they are not tracked
# by this cmake. So, enabling some functions may require enabling
# some others.
option(CONCATENATION "Concatenation" ON)
option(FULLYCONNECTED "Fully Connected" ON)
option(CONVOLUTION "Convolutions" ON)
option(ACTIVATION "Activations" ON)
option(POOLING "Pooling" ON)
option(SOFTMAX "Softmax" ON)
option(BASICMATHSNN "Basic Maths for NN" ON)
option(RESHAPE "Reshape" ON)
option(SVDF "SVDF" ON)
# When OFF it is the default behavior: all tables are included.
option(NNSUPPORT "NN Support" ON)
###########################
#
# CMSIS NN
#
###########################
# NN Sources
SET(NN ${ROOT}/CMSIS/NN)
list(APPEND CMAKE_MODULE_PATH ${NN}/Source)
add_library(cmsis-nn STATIC)
target_compile_options(cmsis-nn PRIVATE -Ofast)
### Includes
target_include_directories(cmsis-nn PUBLIC "${NN}/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/Core/Include")
target_include_directories(cmsis-nn PUBLIC "${ROOT}/CMSIS/DSP/Include")
if (BASICMATHSNN)
add_subdirectory(BasicMathFunctions)
endif()
if (CONCATENATION)
add_subdirectory(ConcatenationFunctions)
endif()
if (FULLYCONNECTED)
add_subdirectory(FullyConnectedFunctions)
endif()
if (CONVOLUTION)
add_subdirectory(ConvolutionFunctions)
endif()
if (ACTIVATION)
add_subdirectory(ActivationFunctions)
endif()
if (POOLING)
add_subdirectory(PoolingFunctions)
endif()
if (SOFTMAX)
add_subdirectory(SoftmaxFunctions)
endif()
if (SVDF)
add_subdirectory(SVDFunctions)
endif()
if (RESHAPE)
add_subdirectory(ReshapeFunctions)
endif()
# Keep NNSUPPORT at the end
if (NNSUPPORT)
add_subdirectory(NNSupportFunctions)
endif()

View File

@@ -0,0 +1,20 @@
#
# Copyright (c) 2019-2021 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_*.c")
target_sources(cmsis-nn PRIVATE ${SRC})

View File

@@ -0,0 +1,66 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_w.c
* Description: s8 version of concatenation along the W axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the W axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_w(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint32_t offset_w)
{
const uint32_t input_copy_size = input_x * input_y * input_z * input_w;
output += offset_w * (input_x * input_y * input_z);
arm_memcpy_q7(output, input, input_copy_size);
}
/**
* @} end of Concatenation group
*/
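A minimal usage sketch (not part of this commit; shapes are illustrative): two tensors with two batches each are stacked along W by calling the function twice, with offset_w pointing past the batches already written.

#include "arm_nnfunctions.h"

static int8_t tensor_a[2 * 2 * 2 * 2];   /* x=2, y=2, z=2, w=2 */
static int8_t tensor_b[2 * 2 * 2 * 2];
static int8_t concat_w_out[2 * 2 * 2 * 4];

void example_concat_w(void)
{
    arm_concatenation_s8_w(tensor_a, 2, 2, 2, 2, concat_w_out, 0); /* batches 0..1 */
    arm_concatenation_s8_w(tensor_b, 2, 2, 2, 2, concat_w_out, 2); /* batches 2..3 */
}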

View File

@@ -0,0 +1,75 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_x.c
* Description: s8 version of concatenation along the X axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the X axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_x(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_x,
const uint32_t offset_x)
{
const uint32_t num_iterations = input_y * input_z * input_w;
output += offset_x;
uint32_t i;
// Copy per row
for (i = 0; i < num_iterations; ++i)
{
arm_memcpy_q7(output, input, input_x);
input += input_x;
output += output_x;
}
}
/**
* @} end of Concatenation group
*/
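A minimal usage sketch (not part of this commit; shapes are illustrative): two tensors with input_x = 4 are placed side by side along X inside an output whose rows are output_x = 8 wide, so each input row is copied with the output row stride.

#include "arm_nnfunctions.h"

static int8_t in_left[4 * 3 * 2];    /* x=4, y=3, z=2, w=1 */
static int8_t in_right[4 * 3 * 2];
static int8_t concat_x_out[8 * 3 * 2];

void example_concat_x(void)
{
    arm_concatenation_s8_x(in_left,  4, 3, 2, 1, concat_x_out, 8, 0); /* columns 0..3 */
    arm_concatenation_s8_x(in_right, 4, 3, 2, 1, concat_x_out, 8, 4); /* columns 4..7 */
}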

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_y.c
* Description: s8 version of concatenation along the Y axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Y axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_y(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_y,
const uint32_t offset_y)
{
const uint32_t num_iterations = input_z * input_w;
const uint32_t input_copy_size = input_x * input_y;
const uint32_t output_stride = input_x * output_y;
output += offset_y * input_x;
uint32_t i;
// Copy per tile
for (i = 0; i < num_iterations; ++i)
{
arm_memcpy_q7(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/

View File

@@ -0,0 +1,75 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_concatenation_s8_z.c
* Description: s8 version of concatenation along the Z axis
*
* $Date: October 2019
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup Concatenation
* @{
*/
/*
* s8 version of concatenation along the Z axis
*
* Refer to header file for details.
*
*/
void arm_concatenation_s8_z(const int8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_z,
const uint16_t input_w,
int8_t *output,
const uint16_t output_z,
const uint32_t offset_z)
{
const uint32_t input_copy_size = input_x * input_y * input_z;
const uint32_t output_stride = input_x * input_y * output_z;
output += offset_z * (input_x * input_y);
uint32_t i;
for (i = 0; i < input_w; ++i)
{
arm_memcpy_q7(output, input, input_copy_size);
input += input_copy_size;
output += output_stride;
}
}
/**
* @} end of Concatenation group
*/
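A minimal usage sketch (not part of this commit; shapes are illustrative): two 4x4 feature maps with 8 and 16 channels are stacked along Z into an output declared with output_z = 24; offset_z selects the first channel written by each call.

#include "arm_nnfunctions.h"

static int8_t feat_a[4 * 4 * 8];        /* x=4, y=4, z=8,  w=1 */
static int8_t feat_b[4 * 4 * 16];       /* x=4, y=4, z=16, w=1 */
static int8_t concat_z_out[4 * 4 * 24];

void example_concat_z(void)
{
    arm_concatenation_s8_z(feat_a, 4, 4, 8,  1, concat_z_out, 24, 0); /* channels 0..7  */
    arm_concatenation_s8_z(feat_b, 4, 4, 16, 1, concat_z_out, 24, 8); /* channels 8..23 */
}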

View File

@@ -0,0 +1,24 @@
#
# Copyright (c) 2019-2022 Arm Limited.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
file(GLOB SRC "./*_s8*.c")
file(GLOB SRC_S16 "./*_s16*.c")
target_sources(cmsis-nn PRIVATE ${SRC} ${SRC_S16})

View File

@@ -0,0 +1,205 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1_x_n_s8.c
* Description: s8 version of 1xN convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* 1xN s8 convolution function.
*
 * Refer to the header file for details.
*
*/
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
arm_status status = ARM_MATH_SUCCESS;
if (output_dims->w % 4 != 0)
{
status = ARM_MATH_SIZE_MISMATCH;
goto out;
}
#if defined(ARM_MATH_MVEI)
(void)ctx;
const uint16_t input_x = input_dims->w;
const uint16_t kernel_x = filter_dims->w;
const uint16_t output_x = output_dims->w;
const uint16_t output_ch = output_dims->c;
const uint16_t input_ch = input_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t stride_x = conv_params->stride.w;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
{
int32_t input_begin_idx[4];
int32_t ker_begin_idx[4];
int32_t ker_end_idx[4];
for (int i = 0; i < 4; i++)
{
const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
input_begin_idx[i] = MAX(0, est_input_x_idx);
ker_begin_idx[i] = MAX(0, -est_input_x_idx);
ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
}
if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32x4_t s_offset;
int32_t acc[4];
{
int32_t sum_row[4];
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[0] * input_ch),
&sum_row[0],
&acc[0]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
input_data + input_begin_idx[1] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[1] * input_ch),
&sum_row[1],
&acc[1]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
input_data + input_begin_idx[2] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[2] * input_ch),
&sum_row[2],
&acc[2]);
(void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
input_data + input_begin_idx[3] * input_ch,
filter_data + (input_ch * kernel_x * i_out_ch) +
(ker_begin_idx[3] * input_ch),
&sum_row[3],
&acc[3]);
s_offset = vldrwq_s32(sum_row);
}
int32x4_t res = vldrwq_s32(acc);
s_offset = vmulq_n_s32(s_offset, input_offset);
res = vaddq_s32(res, s_offset);
if (bias_data)
{
res = vaddq_n_s32(res, bias_data[i_out_ch]);
}
res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
res = vaddq_n_s32(res, out_offset);
res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
res = vminq_s32(res, vdupq_n_s32(out_activation_max));
const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
output_data++;
}
output_data += (3 * output_ch);
}
else
{
output_data = arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
stride_x * input_ch,
input_data + input_begin_idx[0] * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
}
}
#else
status = arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
out:
/* Return to application */
return status;
}
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if !defined(ARM_MATH_MVEI)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
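A minimal caller sketch (not part of this commit): the shapes and quantization values are illustrative, the cmsis_nn_* struct fields are assumed from the CMSIS-NN API headers, and the output width is kept a multiple of 4 as required by the size check above. The scratch buffer is only used on non-MVE builds and is sized via the get_buffer_size helper.

#include "arm_nnfunctions.h"

/* Worst-case scratch for the DSP path: 2 * input_ch * kernel_w * kernel_h int16 values. */
static int16_t conv_1xn_scratch[2 * 8 * 3 * 1];

arm_status example_convolve_1_x_n(const q7_t *input, const q7_t *kernel, const int32_t *bias,
                                  q7_t *output, int32_t *per_ch_mult, int32_t *per_ch_shift)
{
    cmsis_nn_dims input_dims  = {.n = 1, .h = 1, .w = 16, .c = 8};
    cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 3, .c = 8};
    cmsis_nn_dims bias_dims   = {.n = 1, .h = 1, .w = 1, .c = 16};
    cmsis_nn_dims output_dims = {.n = 1, .h = 1, .w = 16, .c = 16}; /* w is a multiple of 4 */

    cmsis_nn_conv_params conv_params = {
        .input_offset = 128, .output_offset = -128,              /* example offsets */
        .stride = {.w = 1, .h = 1}, .padding = {.w = 1, .h = 0},
        .dilation = {.w = 1, .h = 1},                            /* field assumed from the NN API */
        .activation = {.min = -128, .max = 127}};
    cmsis_nn_per_channel_quant_params quant_params = {.multiplier = per_ch_mult,
                                                      .shift = per_ch_shift};

    cmsis_nn_context ctx;
    ctx.size = arm_convolve_1_x_n_s8_get_buffer_size(&input_dims, &filter_dims);
    ctx.buf = conv_1xn_scratch;

    return arm_convolve_1_x_n_s8(&ctx, &conv_params, &quant_params,
                                 &input_dims, input, &filter_dims, kernel,
                                 &bias_dims, bias, &output_dims, output);
}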

View File

@@ -0,0 +1,235 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_HWC_q7_fast_nonsquare.c
* Description: Fast Q7 version of 1x1 convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
 * @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in_x input tensor dimension x
 * @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is optimized for convolution with 1x1 kernel size (i.e., dim_kernel_x=1
* and dim_kernel_y=1). It can be used for the second half of MobileNets [1] after depthwise
* separable convolution.
*
* This function is the version with full list of optimization tricks, but with
* some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*
* [1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications
* https://arxiv.org/abs/1704.04861
*/
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
(void)dim_im_in_y;
int16_t i_out_y, i_out_x;
int16_t i_ch_out;
/* -----------------------
 * Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
 * im2col writes its output in q15_t format from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_out_y * dim_im_in_x + i_out_x) * ch_im_in, pBuffer, ch_im_in);
pBuffer += ch_im_in;
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is a left-over output pixel to compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
q31_t sum = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0 || dim_kernel_x != 1 || dim_kernel_y != 1 || padding_x != 0 ||
padding_y != 0 || stride_x != 1 || stride_y != 1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_y + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
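A minimal caller sketch (not part of this commit; the image size, channel counts and shift values are illustrative). The constraints above are respected: ch_im_in is a multiple of 4, ch_im_out a multiple of 2, and bufferA holds 2 * ch_im_in q15 values for the 1x1 im2col double buffer.

#include "arm_nnfunctions.h"

#define CONV1X1_IN_X   10
#define CONV1X1_IN_Y   8
#define CONV1X1_CH_IN  8   /* multiple of 4 */
#define CONV1X1_CH_OUT 16  /* multiple of 2 */

static q7_t  conv1x1_in[CONV1X1_IN_X * CONV1X1_IN_Y * CONV1X1_CH_IN];
static q7_t  conv1x1_wt[CONV1X1_CH_OUT * CONV1X1_CH_IN];
static q7_t  conv1x1_bias[CONV1X1_CH_OUT];
static q7_t  conv1x1_out[CONV1X1_IN_X * CONV1X1_IN_Y * CONV1X1_CH_OUT];
static q15_t conv1x1_buffer_a[2 * CONV1X1_CH_IN];

arm_status example_conv_1x1_q7(void)
{
    return arm_convolve_1x1_HWC_q7_fast_nonsquare(conv1x1_in, CONV1X1_IN_X, CONV1X1_IN_Y, CONV1X1_CH_IN,
                                                  conv1x1_wt, CONV1X1_CH_OUT,
                                                  1, 1,               /* 1x1 kernel */
                                                  0, 0, 1, 1,         /* no padding, stride 1 */
                                                  conv1x1_bias, 0, 7, /* bias_shift, out_shift (example) */
                                                  conv1x1_out, CONV1X1_IN_X, CONV1X1_IN_Y,
                                                  conv1x1_buffer_a, NULL);
}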

View File

@@ -0,0 +1,161 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_1x1_s8_fast.c
 * Description: Fast s8 version of 1x1 convolution (non-square shape)
*
* $Date: 12. November 2021
* $Revision: V.2.0.4
*
* Target Processor: Cortex-M Processors
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
#include <stdio.h>
#define DIM_KER_X (1U)
#define DIM_KER_Y (1U)
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Fast s8 version for 1x1 convolution (non-square shape)
*
 * Refer to the header file for details.
*
*/
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
if (input_dims->c % 4 != 0 || conv_params->padding.w != 0 || conv_params->padding.h != 0 ||
conv_params->stride.w != 1 || conv_params->stride.h != 1)
{
return ARM_MATH_SIZE_MISMATCH;
}
(void)ctx;
(void)filter_dims;
(void)bias_dims;
#if defined(ARM_MATH_MVEI)
const int32_t col_len = input_dims->w * input_dims->h * input_dims->n;
const int32_t output_ch = output_dims->c;
const int32_t input_ch = input_dims->c;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_items = 0; i_items <= (col_len - 4); i_items += 4)
{
output_data = arm_nn_mat_mul_core_4x_s8(input_ch,
input_ch,
input_data + i_items * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
}
/* Handle left over elements */
for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++)
{
for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
int32_t sum_row = 0;
int32_t acc;
(void)arm_nn_mat_mul_core_1x_s8(
input_ch, input_data + i_items * input_ch, filter_data + i_out_ch * input_ch, &sum_row, &acc);
if (bias_data)
{
acc += bias_data[i_out_ch];
}
sum_row = (sum_row * input_offset);
acc += sum_row;
acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]);
acc += out_offset;
acc = MAX(acc, out_activation_min);
acc = MIN(acc, out_activation_max);
*output_data++ = acc;
}
}
#else
/* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */
const int32_t lhs_rows = input_dims->w * input_dims->h * input_dims->n;
const int32_t rhs_rows = output_dims->c;
const int32_t rhs_cols = input_dims->c;
arm_nn_mat_mult_nt_t_s8(input_data,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max);
#endif
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims)
{
(void)input_dims;
return 0;
}
/**
* @} end of NNConv group
*/
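A minimal caller sketch (not part of this commit): shapes and quantization parameters are illustrative, the struct fields are assumed from the CMSIS-NN API headers, and the constraints checked above are met (input channels a multiple of 4, no padding, stride 1). No scratch memory is required, as the get_buffer_size function confirms.

#include "arm_nnfunctions.h"

arm_status example_conv_1x1_s8(const q7_t *input, const q7_t *kernel, const int32_t *bias,
                               q7_t *output, int32_t *mult, int32_t *shift)
{
    cmsis_nn_dims input_dims  = {.n = 1, .h = 4, .w = 4, .c = 8};  /* c is a multiple of 4 */
    cmsis_nn_dims filter_dims = {.n = 16, .h = 1, .w = 1, .c = 8};
    cmsis_nn_dims bias_dims   = {.n = 1, .h = 1, .w = 1, .c = 16};
    cmsis_nn_dims output_dims = {.n = 1, .h = 4, .w = 4, .c = 16};

    cmsis_nn_conv_params conv_params = {
        .input_offset = 128, .output_offset = -128,              /* example offsets */
        .stride = {.w = 1, .h = 1}, .padding = {.w = 0, .h = 0},
        .dilation = {.w = 1, .h = 1},                            /* field assumed from the NN API */
        .activation = {.min = -128, .max = 127}};
    cmsis_nn_per_channel_quant_params quant_params = {.multiplier = mult, .shift = shift};

    cmsis_nn_context ctx = {.buf = NULL,
                            .size = arm_convolve_1x1_s8_fast_get_buffer_size(&input_dims)};

    return arm_convolve_1x1_s8_fast(&ctx, &conv_params, &quant_params,
                                    &input_dims, input, &filter_dims, kernel,
                                    &bias_dims, bias, &output_dims, output);
}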

View File

@@ -0,0 +1,209 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_basic.c
* Description: Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
uint16_t im2col_out_pixel_index = 0;
q15_t *pBuffer = bufferA;
q15_t *pOut = Im_out;
q15_t *im_buffer = bufferA;
const q15_t *pA;
int i;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
pA = wt;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = im_buffer;
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q15_t)__SSAT((sum >> out_shift), 16);
pOut++;
}
/* counter reset */
pBuffer = im_buffer;
im2col_out_pixel_index++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
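A minimal caller sketch (not part of this commit; the image size, channel counts and shifts are illustrative). bufferA is sized per the comment above: ch_im_in * dim_kernel * dim_kernel q15 values for one im2col column.

#include "arm_nnfunctions.h"

static q15_t q15conv_in[8 * 8 * 4];       /* 8x8 image, 4 channels */
static q15_t q15conv_wt[8 * 4 * 3 * 3];   /* 8 filters, 3x3 kernel */
static q15_t q15conv_bias[8];
static q15_t q15conv_out[8 * 8 * 8];
static q15_t q15conv_buffer_a[4 * 3 * 3];

arm_status example_conv_q15_basic(void)
{
    return arm_convolve_HWC_q15_basic(q15conv_in, 8, 4, q15conv_wt, 8, 3,
                                      1,                   /* padding keeps dim_im_out == dim_im_in */
                                      1,                   /* stride */
                                      q15conv_bias, 0, 10, /* bias_shift, out_shift (example) */
                                      q15conv_out, 8,
                                      q15conv_buffer_a, NULL);
}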

View File

@@ -0,0 +1,259 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q15_fast.c
* Description: Fast Q15 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q15 convolution function
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
* dim_im_out is a multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0 || dim_im_out & 0x1)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel * dim_kernel;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel * dim_kernel;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel * dim_kernel;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,270 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
 * Title: arm_convolve_HWC_q15_fast_nonsquare.c
 * Description: Fast Q15 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in_x input tensor dimension x
 * @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 2
*
* ch_im_out is multiple of 2
*
*/
arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q15_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q15_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q15_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
q15_t *pBuffer = bufferA;
q15_t *im_buffer = bufferA;
q15_t *pOut = Im_out;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/* Run the following code for Cortex-M4 and Cortex-M7 */
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* arm_copy_q15((q15_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer,
(q15_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in,
sizeof(q15_t) * ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (i_out_x & 0x1)
{
int i;
/* initialize the matrix pointers for A */
const q15_t *pA = wt;
/* set up the second output pointers */
q15_t *pOut2 = pOut + ch_im_out;
/* this loop over rows in A */
for (i = 0; i < ch_im_out; i += 2)
{
/* setup pointers for B */
const q15_t *pB = im_buffer;
const q15_t *pB2 = pB + ch_im_in * dim_kernel_y * dim_kernel_x;
/* align the second pointer for A */
const q15_t *pA2 = pA + ch_im_in * dim_kernel_y * dim_kernel_x;
/* init the sum with bias */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)bias[i + 1] << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 1;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA1 = arm_nn_read_q15x2_ia(&pA);
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inA2 = arm_nn_read_q15x2_ia(&pA2);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x1;
while (colCnt)
{
q15_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q15_t)__SSAT(sum >> out_shift, 16);
*pOut++ = (q15_t)__SSAT(sum3 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum2 >> out_shift, 16);
*pOut2++ = (q15_t)__SSAT(sum4 >> out_shift, 16);
/* skip the row computed with A2 */
pA += ch_im_in * dim_kernel_y * dim_kernel_x;
} /* for over ch_im_out */
pOut += ch_im_out;
/* counter reset */
pBuffer = im_buffer;
}
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 2 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_x * dim_kernel_y + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q15_t)__SSAT((conv_out >> out_shift), 16);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,280 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_RGB.c
* Description: Q7 version of convolution for RGB image
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 convolution function for RGB image
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals 3
*
 * This kernel is written exclusively for convolutions with ch_im_in
 * equal to 3. This applies to the first layer of CNNs, which takes an
 * RGB image as input.
*/
arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
 * Here we use bufferA as q15_t internally, as computations are done at the q15_t level;
 * im2col writes its output in q15_t format from the q7_t input
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
// This part implements the im2col function
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */
arm_memset_q7((q7_t *)pBuffer, (q7_t)0, 3 * sizeof(q15_t));
pBuffer += 3;
}
else
{
/*
* Equivalent to:
* arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3);
*/
const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3;
q31_t buf = arm_nn_read_q7x4(pPixel);
union arm_nnword top;
union arm_nnword bottom;
top.word = __SXTB16(buf);
bottom.word = __SXTB16(__ROR(buf, 8));
#ifndef ARM_MATH_BIG_ENDIAN
/*
* little-endian, | omit | 3rd | 2nd | 1st |
* MSB LSB
* top | 3rd | 1st |; bottom | omit | 2nd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = top.word;
* *(pBuffer+2) = bottom.half_words[0];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = top.half_words[0];
int32_t packed_word = __PKHBT(bottom.word, top.word, 0);
arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#else
/*
* big-endian, | 1st | 2nd | 3rd | omit |
* MSB LSB
* top | 2nd | omit |; bottom | 1st | 3rd |
*
* version 1, need to swap 2nd and 3rd weight
* *__SIMD32(pBuffer) = bottom.word;
* *(pBuffer+2) = top.half_words[1];
*
* version 2, no weight shuffling required
*/
*pBuffer++ = bottom.half_words[0];
int32_t packed_word = __PKHTB(top.word, bottom.word, 0);
arm_memcpy_q7((q7_t *)pBuffer, (q7_t *)&packed_word, 4);
#endif
pBuffer += 2;
}
}
}
if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = 3 * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia((const q15_t **)&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = 3 * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
// check if number of input channels is 3
if (ch_im_in != 3)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = (bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
/* if-for implementation */
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return (ARM_MATH_SUCCESS);
}
/**
* @} end of NNConv group
*/
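A minimal caller sketch (not part of this commit; image size, filter count and shifts are illustrative). ch_im_in is fixed to 3 as required, and bufferA holds 2 * 3 * dim_kernel * dim_kernel q15 values per the buffer-size note above.

#include "arm_nnfunctions.h"

static q7_t  rgb_in[32 * 32 * 3];        /* 32x32 RGB input */
static q7_t  rgb_wt[8 * 3 * 5 * 5];      /* 8 filters, 5x5 kernel */
static q7_t  rgb_bias[8];
static q7_t  rgb_out[32 * 32 * 8];
static q15_t rgb_buffer_a[2 * 3 * 5 * 5];

arm_status example_conv_q7_rgb(void)
{
    return arm_convolve_HWC_q7_RGB(rgb_in, 32, 3, rgb_wt, 8, 5,
                                   2,              /* padding for same-size output */
                                   1,              /* stride */
                                   rgb_bias, 0, 9, /* bias_shift, out_shift (example) */
                                   rgb_out, 32,
                                   rgb_buffer_a, NULL);
}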

View File

@@ -0,0 +1,227 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic.c
* Description: Q7 version of convolution
*
* $Date: 20. July 2021
* $Revision: V.1.1.1
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q7 convolution function
* @param[in] Im_in pointer to input tensor
 * @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* This basic version is designed to work for any input tensor and weight
* dimension.
*/
arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally because the computation is done at q15_t precision.
* im2col converts the q7_t input into q15_t data written to bufferA.
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed once every 2 columns have been filled */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)bufferA;
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
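A minimal usage sketch for arm_convolve_HWC_q7_basic as defined above. All dimensions, shift values and the static buffers are illustrative assumptions; only the bufferA sizing rule (2 * ch_im_in * dim_kernel * dim_kernel q15_t entries) comes from the function documentation.

#include "arm_nnfunctions.h"

#define DIM_IM_IN 16
#define CH_IM_IN 8
#define CH_IM_OUT 16
#define DIM_KERNEL 3
#define PADDING 1
#define STRIDE 1
#define DIM_IM_OUT 16 /* (16 + 2*1 - 3)/1 + 1 */

static q7_t im_in[DIM_IM_IN * DIM_IM_IN * CH_IM_IN];
static q7_t wt[CH_IM_OUT * CH_IM_IN * DIM_KERNEL * DIM_KERNEL];
static q7_t bias[CH_IM_OUT];
static q7_t im_out[DIM_IM_OUT * DIM_IM_OUT * CH_IM_OUT];
/* Scratch buffer: 2 * ch_im_in * dim_kernel * dim_kernel q15_t entries, as documented above */
static q15_t buffer_a[2 * CH_IM_IN * DIM_KERNEL * DIM_KERNEL];

void example_convolve_q7_basic(void)
{
    arm_status status = arm_convolve_HWC_q7_basic(im_in, DIM_IM_IN, CH_IM_IN, wt, CH_IM_OUT,
                                                  DIM_KERNEL, PADDING, STRIDE, bias,
                                                  0 /* bias_shift */, 7 /* out_shift */,
                                                  im_out, DIM_IM_OUT, buffer_a, NULL);
    (void)status; /* the basic kernel always returns ARM_MATH_SUCCESS */
}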

View File

@@ -0,0 +1,229 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_basic.c
* Description: Q7 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Basic Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns <code>ARM_MATH_SUCCESS</code>
*/
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally because the computation is done at q15_t precision.
* im2col converts the q7_t input into q15_t data written to bufferA.
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* Filling 0 for out-of-bound paddings */
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* Computation is performed once every 2 columns have been filled */
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x)
{
pOut = arm_nn_mat_mult_kernel_q7_q15(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* left-over because odd number of output pixels */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
/* Load the accumulator with bias first */
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
/* Point to the beginning of the im2col buffer */
const q15_t *pB = bufferA;
/* Each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
}
}
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
(void)bufferA;
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
// if-for implementation
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
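A brief sketch of calling the non-square variant for a hypothetical 64x1 input with a 5x1 kernel; every concrete value here is an assumption chosen only to show the argument order.

#include "arm_nnfunctions.h"

static q7_t in[64 * 1 * 4], w[8 * 4 * 1 * 5], b[8], out[64 * 1 * 8];
static q15_t col_buf[2 * 4 * 1 * 5]; /* 2 * ch_im_in * dim_kernel_y * dim_kernel_x */

void example_convolve_q7_basic_nonsquare(void)
{
    /* dim_im_in_x = 64, dim_im_in_y = 1, ch_im_in = 4, ch_im_out = 8,
       kernel 5x1, padding 2x0, stride 1x1, output 64x1 */
    arm_convolve_HWC_q7_basic_nonsquare(in, 64, 1, 4, w, 8, 5, 1, 2, 0, 1, 1,
                                        b, 0, 7, out, 64, 1, col_buf, NULL);
}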

View File

@@ -0,0 +1,380 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast.c
* Description: Fast Q7 version of convolution
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in is multiple of 4 ( because of the SIMD32 read and swap )
*
* ch_im_out is multiple of 2 ( because of the 2x2 mat_mult kernel )
*
* The im2col converts the Q7 tensor input into Q15 column, which is stored in
* bufferA. There is reordering happening during this im2col process with
* arm_q7_to_q15_reordered_no_shift. For every four elements, the second and
* third elements are swapped.
*
* The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered does the
* GEMM computation with the reordered columns.
*
* To speed-up the determination of the padding condition, we split the
* computation into 3x3 parts, i.e., {top, mid, bottom} X {left, mid, right}.
* This reduces the total number of boundary condition checks and improves
* the data copying performance.
*/
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/*
* Here we use bufferA as q15_t internally because the computation is done at q15_t precision.
* im2col converts the q7_t input into q15_t data written to bufferA.
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out - padding; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out - padding; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift((q7_t *)Im_in +
(i_ker_y * dim_im_in + i_out_x * stride - padding) * ch_im_in,
pBuffer,
ch_im_in * dim_kernel);
pBuffer += ch_im_in * dim_kernel;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel * dim_kernel)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over for compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel * dim_kernel >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = ch_im_in * dim_kernel * dim_kernel & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out; j++)
{
for (k = 0; k < dim_im_out; k++)
{
conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel; m++)
{
for (n = 0; n < dim_kernel; n++)
{
// if-for implementation
in_row = stride * j + m - padding;
in_col = stride * k + n - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel * dim_kernel + (m * dim_kernel + n) * ch_im_in + l];
}
}
}
}
Im_out[i + (j * dim_im_out + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
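The fast kernel rejects inputs whose channel counts break its SIMD assumptions, so a caller typically checks the constraints and falls back to the basic kernel. A sketch under assumed bias/output shifts; only the two CMSIS-NN calls come from the sources above.

#include "arm_nnfunctions.h"

arm_status example_convolve_q7_fast_or_basic(const q7_t *in, const q7_t *w, const q7_t *b, q7_t *out,
                                             q15_t *buffer_a, uint16_t dim_in, uint16_t ch_in,
                                             uint16_t ch_out, uint16_t k, uint16_t pad,
                                             uint16_t stride, uint16_t dim_out)
{
    if ((ch_in % 4 == 0) && (ch_out % 2 == 0))
    {
        /* Constraints met: use the reordered fast path */
        return arm_convolve_HWC_q7_fast(in, dim_in, ch_in, w, ch_out, k, pad, stride,
                                        b, 0, 7, out, dim_out, buffer_a, NULL);
    }
    /* Otherwise the basic kernel handles arbitrary channel counts */
    return arm_convolve_HWC_q7_basic(in, dim_in, ch_in, w, ch_out, k, pad, stride,
                                     b, 0, 7, out, dim_out, buffer_a, NULL);
}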

View File

@@ -0,0 +1,378 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_HWC_q7_fast_nonsquare.c
* Description:  Fast Q7 version of convolution (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Fast Q7 convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding size x
* @param[in] padding_y padding size y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* This function is the version with the full set of optimization tricks, but with
* some constraints:
* ch_im_in is multiple of 4
* ch_im_out is multiple of 2
*/
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* -----------------------
* Here we use bufferA as q15_t internally because the computation is done at q15_t precision.
* im2col converts the q7_t input into q15_t data written to bufferA.
*/
q15_t *pBuffer = bufferA;
q7_t *pOut = Im_out;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
/*
* Here we split the entire matrix into three regions depending on the padding situation
* Top: i_out_y from 0 to padding - 1
* Middle: i_out_y from padding to dim_im_out-padding-1
* Bottom: i_out_y from dim_im_out-padding to dim_im_out-1
*/
/* top part */
for (i_out_y = 0; i_out_y < padding_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* middle part, here we also divide the x into left, mid and right */
for (; i_out_y < dim_im_out_y - padding_y; i_out_y++)
{
/* left part */
for (i_out_x = 0; i_out_x < padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* mid part */
for (; i_out_x < dim_im_out_x - padding_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_out_x * stride_x - padding_x) * ch_im_in,
pBuffer,
ch_im_in * dim_kernel_x);
pBuffer += ch_im_in * dim_kernel_x;
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
/* right part */
for (; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
for (; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* This part implements the im2col function */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q15(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, sizeof(q15_t) * ch_im_in);
}
else
{
arm_q7_to_q15_reordered_no_shift(
(q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
}
pBuffer += ch_im_in;
}
}
if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_x * dim_kernel_y)
{
pOut = arm_nn_mat_mult_kernel_q7_q15_reordered(
wt, bufferA, ch_im_out, ch_im_in * dim_kernel_x * dim_kernel_y, bias_shift, out_shift, bias, pOut);
/* counter reset */
pBuffer = bufferA;
}
}
}
/* check if there is left-over for compute */
if (pBuffer != bufferA)
{
const q7_t *pA = wt;
int i;
for (i = 0; i < ch_im_out; i++)
{
q31_t sum = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
const q15_t *pB = bufferA;
/* Each iteration processes 4 entries */
uint16_t colCnt = ch_im_in * dim_kernel_x * dim_kernel_y >> 2;
while (colCnt)
{
q31_t inA1, inA2;
q31_t inB1, inB2;
pA = read_and_pad_reordered(pA, &inA1, &inA2);
inB1 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA1, inB1, sum);
inB2 = arm_nn_read_q15x2_ia(&pB);
sum = __SMLAD(inA2, inB2, sum);
colCnt--;
}
colCnt = (ch_im_in * dim_kernel_y * dim_kernel_x) & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
sum += inA1 * inB1;
colCnt--;
}
*pOut = (q7_t)__SSAT((sum >> out_shift), 8);
pOut++;
}
}
#else
(void)bufferA;
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
int i, j, k, l, m, n;
int conv_out;
int in_row, in_col;
if (ch_im_in % 4 != 0 || ch_im_out % 2 != 0)
{
/* check if the input dimension meets the constraints */
return ARM_MATH_SIZE_MISMATCH;
}
for (i = 0; i < ch_im_out; i++)
{
for (j = 0; j < dim_im_out_y; j++)
{
for (k = 0; k < dim_im_out_x; k++)
{
conv_out = ((q31_t)(bias[i]) << bias_shift) + NN_ROUND(out_shift);
for (m = 0; m < dim_kernel_y; m++)
{
for (n = 0; n < dim_kernel_x; n++)
{
/* if-for implementation */
in_row = stride_y * j + m - padding_y;
in_col = stride_x * k + n - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
for (l = 0; l < ch_im_in; l++)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + (m * dim_kernel_x + n) * ch_im_in +
l];
}
}
}
}
Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
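The same channel constraints apply to this non-square fast kernel, and the scratch buffer again needs room for two im2col columns. A sizing sketch with assumed dimensions:

/* bufferA for arm_convolve_HWC_q7_fast_nonsquare: two im2col columns of
   ch_im_in * dim_kernel_x * dim_kernel_y q15_t values (example sizes only) */
#define EX_CH_IM_IN 8
#define EX_DIM_KERNEL_X 5
#define EX_DIM_KERNEL_Y 1
static q15_t buffer_a[2 * EX_CH_IM_IN * EX_DIM_KERNEL_X * EX_DIM_KERNEL_Y];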

View File

@@ -0,0 +1,241 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_fast_s16.c
* Description: Optimized s16 version of convolution.
*
* $Date: 12 August 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s16 convolution function.
*
* Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
* channels are multiples of 4, or at least greater than 4.
*
*/
arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
(void)bias_dims;
if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
{
return ARM_MATH_SIZE_MISMATCH;
}
if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q15_t *buffer_a = (q15_t *)ctx->buf;
const int32_t input_batches = input_dims->n;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = conv_params->padding.w;
const int32_t pad_y = conv_params->padding.h;
const int32_t stride_x = conv_params->stride.w;
const int32_t stride_y = conv_params->stride.h;
const int16_t out_activation_min = conv_params->activation.min;
const int16_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Generate two columns from the input tensor for the GEMM computation */
q15_t *two_column_buf = buffer_a;
q15_t *out = output_data;
/* This part implements the im2col function */
for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
{
for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
i_ker_y++)
{
for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
arm_memcpy_q7((q7_t *)two_column_buf,
(const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
input_ch * sizeof(q15_t));
}
two_column_buf += input_ch;
}
}
/* Computation is performed once every 2 columns have been filled */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out = arm_nn_mat_mult_kernel_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_activation_min,
out_activation_max,
(input_ch * kernel_y * kernel_x),
bias_data,
out);
/* Counter reset */
two_column_buf = buffer_a;
}
}
}
/* Left-over because odd number of output pixels */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
for (i = 0; i < output_ch; i++)
{
/* Init the accumulator */
q31_t sum = 0;
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac */
col_count = input_ch * kernel_y * kernel_x & 0x3;
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
if (bias_data)
{
q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
q63_t acc_64 = sum + bias_data[i];
sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
}
else
{
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
}
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q15_t)sum;
}
}
#else
(void)input_data;
(void)output_data;
(void)bias_data;
(void)filter_data;
(void)buffer_a;
(void)kernel_x;
(void)kernel_y;
(void)pad_x;
(void)pad_y;
(void)stride_x;
(void)stride_y;
(void)out_activation_min;
(void)out_activation_max;
(void)output_mult;
(void)output_shift;
return ARM_MATH_ARGUMENT_ERROR;
#endif
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
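A sketch of driving arm_convolve_fast_s16 with a caller-provided scratch buffer. The helper below only wires the calls together; dynamic allocation via malloc is an assumption, and the kernel itself rejects cases where filter_w * filter_h * input_ch >= 512.

#include <stdlib.h>
#include "arm_nnfunctions.h"

arm_status example_convolve_fast_s16(const cmsis_nn_conv_params *conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims, const q15_t *input,
                                     const cmsis_nn_dims *filter_dims, const q7_t *filter,
                                     const cmsis_nn_dims *bias_dims, const int64_t *bias,
                                     const cmsis_nn_dims *output_dims, q15_t *output)
{
    cmsis_nn_context ctx = {0};
    const int32_t buf_size = arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
    if (buf_size > 0)
    {
        ctx.buf = malloc(buf_size); /* any suitably aligned scratch memory of buf_size bytes works */
    }
    arm_status status = arm_convolve_fast_s16(&ctx, conv_params, quant_params, input_dims, input,
                                              filter_dims, filter, bias_dims, bias,
                                              output_dims, output);
    free(ctx.buf);
    return status;
}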

View File

@@ -0,0 +1,156 @@
/*
* Copyright (C) 2010-2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s16.c
* Description: s16 version of convolution using symmetric quantization.
*
* $Date: January 13, 2022
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s16 convolution function.
*
* Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
* channels are multiples of 4, or at least greater than 4.
*
*/
arm_status arm_convolve_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
(void)bias_dims;
(void)ctx;
const int32_t input_batches = input_dims->n;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = conv_params->padding.w;
const int32_t pad_y = conv_params->padding.h;
const int32_t stride_x = conv_params->stride.w;
const int32_t stride_y = conv_params->stride.h;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
for (int32_t i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
{
const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i_out_ch]);
for (int32_t base_idx_y = -pad_y, i_out_y = 0; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int32_t base_idx_x = -pad_x, i_out_x = 0; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
int64_t conv_out_acc = 0;
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
const int32_t ker_y_start = MAX(0, start_y_max);
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
const int32_t ker_x_start = MAX(0, start_x_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
const int32_t ker_y_end = MIN(kernel_y, end_min_y);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
const int32_t ker_x_end = MIN(kernel_x, end_min_x);
for (int32_t i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
for (int32_t i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t in_row = base_idx_y + dilation_y * i_ker_y;
const int32_t in_col = base_idx_x + dilation_x * i_ker_x;
for (int32_t i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
conv_out_acc += input_data[(in_row * input_x + in_col) * input_ch + i_input_ch] *
filter_data[i_out_ch * input_ch * kernel_y * kernel_x +
(i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch];
}
}
}
if (bias_data)
{
conv_out_acc += bias_data[i_out_ch];
}
int32_t conv_out = arm_nn_requantize_s64(conv_out_acc, reduced_multiplier, output_shift[i_out_ch]);
conv_out = MAX(conv_out, out_activation_min);
conv_out = MIN(conv_out, out_activation_max);
output_data[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int16_t)conv_out;
}
}
}
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
(void)input_dims;
(void)filter_dims;
return 0;
}
/**
* @} end of NNConv group
*/
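The s16 reference kernel needs no scratch memory (its get_buffer_size returns 0) and, unlike the fast variant, honours dilation. A minimal sketch of a dilated setup; the concrete dimensions, activation limits and the zeroed requantization arrays are illustrative assumptions based on the struct fields used above.

#include "arm_nnfunctions.h"

void example_convolve_s16_dilated(const q15_t *input, const q7_t *filter, const int64_t *bias, q15_t *output)
{
    cmsis_nn_context ctx = {0}; /* no scratch buffer required for the reference kernel */
    cmsis_nn_dims input_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
    cmsis_nn_dims filter_dims = {.h = 3, .w = 3, .c = 8};
    cmsis_nn_dims bias_dims = {.c = 16};
    cmsis_nn_dims output_dims = {.n = 1, .h = 16, .w = 16, .c = 16};
    static int32_t mult[16], shift[16]; /* fill with real per-channel requantization parameters */
    cmsis_nn_per_channel_quant_params quant = {.multiplier = mult, .shift = shift};
    /* Effective kernel extent: 1 + (3 - 1) * 2 = 5, so padding 2 keeps the 16x16 spatial size */
    cmsis_nn_conv_params conv = {.padding = {.w = 2, .h = 2},
                                 .stride = {.w = 1, .h = 1},
                                 .dilation = {.w = 2, .h = 2},
                                 .activation = {.min = -32768, .max = 32767}};
    arm_convolve_s16(&ctx, &conv, &quant, &input_dims, input, &filter_dims, filter,
                     &bias_dims, bias, &output_dims, output);
}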

View File

@@ -0,0 +1,335 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_s8.c
* Description: s8 version of convolution using symmetric quantization.
*
* $Date: December 14, 2021
* $Revision: V.2.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Basic s8 convolution function.
*
* Refer to the header file for details. The optimal use case for the DSP/MVE implementation is when input and output
* channels are multiples of 4, or at least greater than 4.
*
*/
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
(void)bias_dims;
if (ctx->buf == NULL && arm_convolve_s8_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
q15_t *buffer_a = (q15_t *)ctx->buf;
const int32_t input_batches = input_dims->n;
const uint16_t input_x = input_dims->w;
const uint16_t input_y = input_dims->h;
const uint16_t input_ch = input_dims->c;
const uint16_t kernel_x = filter_dims->w;
const uint16_t kernel_y = filter_dims->h;
const uint16_t output_x = output_dims->w;
const uint16_t output_y = output_dims->h;
const uint16_t output_ch = output_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t pad_y = conv_params->padding.h;
const uint16_t stride_x = conv_params->stride.w;
const uint16_t stride_y = conv_params->stride.h;
const int32_t input_offset = conv_params->input_offset;
const int32_t out_offset = conv_params->output_offset;
const int32_t out_activation_min = conv_params->activation.min;
const int32_t out_activation_max = conv_params->activation.max;
int32_t *output_mult = quant_params->multiplier;
int32_t *output_shift = quant_params->shift;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
#if defined(ARM_MATH_MVEI)
/* Generate up to four columns from the input tensor for the GEMM computation */
q7_t *im2col_buf = (q7_t *)buffer_a;
q7_t *out = output_data;
int32_t buffer_fill_cnt = 0;
int32_t padded = 0;
const int32_t num_elem = kernel_x * kernel_y * input_ch;
const int32_t dilation_x = conv_params->dilation.w;
const int32_t dilation_y = conv_params->dilation.h;
/* This part implements the im2col function */
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
memset(im2col_buf, (int8_t)-input_offset, sizeof(q7_t) * input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(im2col_buf, input_data + (k_y * input_x + k_x) * input_ch, input_ch);
}
im2col_buf += input_ch;
}
}
buffer_fill_cnt++;
/* Computation is performed once every 4 columns have been filled */
if (buffer_fill_cnt == 4 && (padded == 0))
{
buffer_fill_cnt = 0;
out = arm_nn_mat_mul_core_4x_s8(num_elem,
num_elem,
(q7_t *)buffer_a,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
}
else if (buffer_fill_cnt == 4 && (padded != 0))
{
buffer_fill_cnt = 0;
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
4,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
im2col_buf = (q7_t *)buffer_a;
padded = 0;
}
}
}
/* Handle left over columns */
if (buffer_fill_cnt != 0)
{
out = arm_nn_mat_mult_s8(filter_data,
(q7_t *)buffer_a,
output_ch,
buffer_fill_cnt,
output_shift,
output_mult,
out_offset,
input_offset,
0,
out_activation_min,
out_activation_max,
num_elem,
bias_data,
out);
}
#else // #if defined(ARM_MATH_MVEI)
const uint16_t dilation_x = conv_params->dilation.w;
const uint16_t dilation_y = conv_params->dilation.h;
int32_t i_out_y, i_out_x, i_ker_y, i_ker_x;
/* Generate two columns from the input tensor for the GEMM computation */
q15_t *two_column_buf = buffer_a;
q7_t *out = output_data;
/* This part implements the im2col function */
for (i_out_y = 0; i_out_y < output_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int32_t base_idx_y = stride_y * i_out_y - pad_y;
const int32_t base_idx_x = stride_x * i_out_x - pad_x;
for (i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t k_y = base_idx_y + dilation_y * i_ker_y;
const int32_t k_x = base_idx_x + dilation_x * i_ker_x;
if (k_y < 0 || k_y >= input_y || k_x < 0 || k_x >= input_x)
{
/* Filling 0 for out-of-bound paddings */
memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
}
else
{
/* Copying the pixel data to column */
arm_q7_to_q15_with_offset(
input_data + (k_y * input_x + k_x) * input_ch, two_column_buf, input_ch, input_offset);
}
two_column_buf += input_ch;
}
}
/* Computation is performed once every 2 columns have been filled */
if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
{
out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
buffer_a,
output_ch,
output_shift,
output_mult,
out_offset,
out_activation_min,
out_activation_max,
input_ch * kernel_y * kernel_x,
bias_data,
out);
/* counter reset */
two_column_buf = buffer_a;
}
}
}
/* left-over because odd number of output pixels */
if (two_column_buf != buffer_a)
{
const q7_t *ker_a = filter_data;
int i;
for (i = 0; i < output_ch; i++)
{
/* Load the accumulator with bias first */
q31_t sum = 0;
if (bias_data)
{
sum = bias_data[i];
}
/* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
const q15_t *ip_as_col = buffer_a;
/* 4 multiply and accumulates are done in one loop. */
#if defined(ARM_MATH_DSP)
uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
while (col_count)
{
q31_t ker_a1, ker_a2;
q31_t ip_b1, ip_b2;
ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a1, ip_b1, sum);
ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
sum = __SMLAD(ker_a2, ip_b2, sum);
col_count--;
}
/* Handle left over mac */
col_count = input_ch * kernel_y * kernel_x & 0x3;
#else
uint16_t col_count = input_ch * kernel_y * kernel_x;
#endif
while (col_count)
{
q7_t ker_a1 = *ker_a++;
q15_t ip_b1 = *ip_as_col++;
sum += ker_a1 * ip_b1;
col_count--;
}
sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
sum += out_offset;
sum = MAX(sum, out_activation_min);
sum = MIN(sum, out_activation_max);
*out++ = (q7_t)sum;
}
}
#endif // #if defined(ARM_MATH_MVEI)
/* Advance to the next batch */
input_data += (input_x * input_y * input_ch);
output_data += (output_x * output_y * output_ch);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
int32_t col_length = input_dims->c * filter_dims->w * filter_dims->h;
// Get the number of complete int16 lanes (multiple of 8) for the given col_length. This depends on the
// implementation of arm_nn_mat_mult_s8
col_length = (col_length + 7) / 8;
// 4 -> number of im2col buffers, 8 -> 8 elements per Q register
return 4 * col_length * 8 * (int32_t)sizeof(int8_t);
#else
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
#endif
}
/**
* @} end of NNConv group
*/
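The s8 call sequence: query the scratch size first, because arm_convolve_s8 returns ARM_MATH_ARGUMENT_ERROR when ctx->buf is NULL while a non-zero buffer is required. Only the CMSIS-NN calls are taken from the code above; the wrapper function and malloc-based allocation are assumptions.

#include <stdlib.h>
#include "arm_nnfunctions.h"

arm_status example_convolve_s8(const cmsis_nn_conv_params *conv_params,
                               const cmsis_nn_per_channel_quant_params *quant_params,
                               const cmsis_nn_dims *input_dims, const q7_t *input,
                               const cmsis_nn_dims *filter_dims, const q7_t *filter,
                               const cmsis_nn_dims *bias_dims, const int32_t *bias,
                               const cmsis_nn_dims *output_dims, q7_t *output)
{
    cmsis_nn_context ctx = {0};
    const int32_t buf_size = arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
    if (buf_size > 0)
    {
        ctx.buf = malloc(buf_size);
        if (ctx.buf == NULL)
        {
            return ARM_MATH_ARGUMENT_ERROR;
        }
    }
    arm_status status = arm_convolve_s8(&ctx, conv_params, quant_params, input_dims, input,
                                        filter_dims, filter, bias_dims, bias, output_dims, output);
    free(ctx.buf);
    return status;
}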

View File

@@ -0,0 +1,130 @@
/*
* Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s16.c
* Description:  s16 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
* cmsis-nn to perform the convolution.
*
* $Date: 13 January 2022
* $Revision: V.1.2.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer to the header file for details.
*
*/
arm_status arm_convolve_wrapper_s16(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
(conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_fast_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else
{
return arm_convolve_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
#else
return arm_convolve_s16(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
#endif
}
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
(void)conv_params;
(void)output_dims;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
if (filter_dims->w * filter_dims->h * input_dims->c < 512 &&
(conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
}
return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
#else
return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
#endif
}
/**
* @} end of NNConv group
*/
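Because the wrapper may dispatch to either the fast or the reference s16 kernel, the scratch buffer should be sized through the wrapper's own helper so it matches whichever path is taken. A tiny sketch; the surrounding setup would mirror the fast_s16 example earlier in this commit.

#include "arm_nnfunctions.h"

int32_t example_wrapper_s16_scratch_bytes(const cmsis_nn_conv_params *conv_params,
                                          const cmsis_nn_dims *input_dims,
                                          const cmsis_nn_dims *filter_dims,
                                          const cmsis_nn_dims *output_dims)
{
    /* Returns the size required by whichever s16 kernel the wrapper will dispatch to */
    return arm_convolve_wrapper_s16_get_buffer_size(conv_params, input_dims, filter_dims, output_dims);
}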

View File

@@ -0,0 +1,133 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_convolve_wrapper_s8.c
* Description:  s8 convolution layer wrapper function whose main purpose is to call the optimal kernel available in
* cmsis-nn to perform the convolution.
*
* $Date: 02. December 2021
* $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Convolution layer
*
* Refer to the header file for details.
*
*/
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input_data,
const cmsis_nn_dims *filter_dims,
const q7_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
q7_t *output_data)
{
if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
(filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1x1_s8_fast(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
(input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1_x_n_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
else
{
return arm_convolve_s8(ctx,
conv_params,
quant_params,
input_dims,
input_data,
filter_dims,
filter_data,
bias_dims,
bias_data,
output_dims,
output_data);
}
}
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
if ((conv_params->padding.w == 0) && (conv_params->padding.h == 0) && (input_dims->c % 4 == 0) &&
(conv_params->stride.w == 1) && (conv_params->stride.h == 1) && (filter_dims->w == 1) &&
(filter_dims->h == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1x1_s8_fast_get_buffer_size(input_dims);
}
else if ((output_dims->h == 1) && (input_dims->h == 1) && (filter_dims->h == 1) && (output_dims->w % 4 == 0) &&
(input_dims->n == 1) && (conv_params->dilation.w == 1 && conv_params->dilation.h == 1))
{
return arm_convolve_1_x_n_s8_get_buffer_size(input_dims, filter_dims);
}
else
{
return arm_convolve_s8_get_buffer_size(input_dims, filter_dims);
}
}
/**
* @} end of NNConv group
*/
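The s8 wrapper routes 1x1 kernels with zero padding, unit stride/dilation and input channels divisible by 4 to arm_convolve_1x1_s8_fast, single-row 1xN cases to arm_convolve_1_x_n_s8, and everything else to arm_convolve_s8. A sketch of a pointwise configuration that takes the 1x1 fast path; the concrete dimensions are assumptions.

#include "arm_nnfunctions.h"

int32_t example_pointwise_s8_scratch_bytes(const cmsis_nn_conv_params *conv_params)
{
    /* 1x1 pointwise convolution: with zero padding, unit stride/dilation and
       input channels divisible by 4 the wrapper picks arm_convolve_1x1_s8_fast */
    cmsis_nn_dims input_dims = {.n = 1, .h = 8, .w = 8, .c = 16};
    cmsis_nn_dims filter_dims = {.h = 1, .w = 1, .c = 16};
    cmsis_nn_dims output_dims = {.n = 1, .h = 8, .w = 8, .c = 32};
    return arm_convolve_wrapper_s8_get_buffer_size(conv_params, &input_dims, &filter_dims, &output_dims);
}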

View File

@@ -0,0 +1,212 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_3x3_s8.c
* Description: Optimized s8 depthwise convolution function for channel
* multiplier of 1 and 3x3 kernel size.
*
* $Date: 09. October 2020
* $Revision: V.2.0.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that
* in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1
*
* Refer prototype header file for details.
*
*/
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
(void)ctx;
(void)bias_dims;
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
/* Check input constraints input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
/* Check input constraints: pad_x <= 1 and a 3x3 kernel */
if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
{
return ARM_MATH_ARGUMENT_ERROR;
}
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
int32_t in_ch = 0;
int32_t ker_w_start = MAX(0, -in_w);
for (; in_ch <= (input_ch - 4); in_ch += 4)
{
int32_t out_buff0 = bias[in_ch + 0];
int32_t out_buff1 = bias[in_ch + 1];
int32_t out_buff2 = bias[in_ch + 2];
int32_t out_buff3 = bias[in_ch + 3];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
int32_t in_val = 0;
int32_t ker_val = 0;
if (ker_w_start == 0)
{
in_val = arm_nn_read_q7x4(input_ptr);
ker_val = arm_nn_read_q7x4(kernel_ptr);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
in_val = arm_nn_read_q7x4(input_ptr + input_ch);
ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
if ((input_x - in_w) >= 3)
{
in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));
out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);
out_buff0 += output_offset;
out_buff1 += output_offset;
out_buff2 += output_offset;
out_buff3 += output_offset;
out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff0;
output[out_idx++] = (int8_t)out_buff1;
output[out_idx++] = (int8_t)out_buff2;
output[out_idx++] = (int8_t)out_buff3;
}
// Leftover
for (; in_ch < input_ch; ++in_ch)
{
int32_t out_buff = bias[in_ch];
const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;
for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
{
if (ker_w_start == 0)
{
out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
}
out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);
if ((input_x - in_w) >= 3)
{
out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
}
input_ptr += (input_ch * input_x);
kernel_ptr += (input_ch * 3);
}
out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
out_buff += output_offset;
out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff;
}
}
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
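The unrolled channel loop above extracts four q7 lanes from one packed 32-bit word by shifting and casting to int8_t. A self-contained sketch of that trick, assuming a little-endian target (the default for Cortex-M); the packed values are placeholders and plain memcpy stands in for arm_nn_read_q7x4():

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Four packed q7 values, laid out as arm_nn_read_q7x4() would load them. */
    const int8_t packed[4] = {-3, 7, -120, 45};
    int32_t word;
    memcpy(&word, packed, sizeof(word));

    /* Lane extraction as in the inner loop: the cast keeps the low byte, the
     * shifts move lanes 1..3 down first. */
    printf("%d %d %d %d\n",
           (int8_t)word,
           (int8_t)(word >> 8),
           (int8_t)(word >> 16),
           (int8_t)(word >> 24));   /* prints: -3 7 -120 45 */
    return 0;
}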

View File

@@ -0,0 +1,292 @@
/*
* Copyright (C) 2022 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s16.c
* Description: s16 version of depthwise convolution.
*
* $Date: 26. Jan 2022
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void __attribute__((unused)) depthwise_conv_s16_mult_4_s16(const int16_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const int8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int64_t *bias,
int16_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff32[4] = {REDUCE_MULTIPLIER(output_mult[out_ch + 0 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 1 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 2 + mult_tile]),
REDUCE_MULTIPLIER(output_mult[out_ch + 3 + mult_tile])};
int64_t out_buff[4] = {0, 0, 0, 0};
if (bias)
{
out_buff[0] = bias[out_ch + 0 + mult_tile];
out_buff[1] = bias[out_ch + 1 + mult_tile];
out_buff[2] = bias[out_ch + 2 + mult_tile];
out_buff[3] = bias[out_ch + 3 + mult_tile];
}
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
// TODO: Unroll of 4 with 64 bit accumulator will probably result in too much register
// spills. Try with unroll of 2 when enabling this.
int32_t in_val = input[in_idx + ker_w * input_ch];
out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
}
}
out_buff32[0] =
arm_nn_requantize_s64(out_buff[0], out_buff32[0], output_shift[out_ch + 0 + mult_tile]);
out_buff32[1] =
arm_nn_requantize_s64(out_buff[1], out_buff32[1], output_shift[out_ch + 1 + mult_tile]);
out_buff32[2] =
arm_nn_requantize_s64(out_buff[2], out_buff32[2], output_shift[out_ch + 2 + mult_tile]);
out_buff32[3] =
arm_nn_requantize_s64(out_buff[3], out_buff32[3], output_shift[out_ch + 3 + mult_tile]);
out_buff32[0] = MIN(MAX(out_buff32[0], output_activation_min), output_activation_max);
out_buff32[1] = MIN(MAX(out_buff32[1], output_activation_min), output_activation_max);
out_buff32[2] = MIN(MAX(out_buff32[2], output_activation_min), output_activation_max);
out_buff32[3] = MIN(MAX(out_buff32[3], output_activation_min), output_activation_max);
output[out_idx++] = (int16_t)out_buff32[0];
output[out_idx++] = (int16_t)out_buff32[1];
output[out_idx++] = (int16_t)out_buff32[2];
output[out_idx++] = (int16_t)out_buff32[3];
}
}
}
}
}
static void depthwise_conv_s16_generic_s16(const int16_t *input,
const uint16_t input_batches,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const int8_t *kernel,
const uint16_t ch_mult,
const uint16_t kernel_x,
const uint16_t kernel_y,
const uint16_t pad_x,
const uint16_t pad_y,
const uint16_t stride_x,
const uint16_t stride_y,
const int64_t *bias,
int16_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const uint16_t dilation_x,
const uint16_t dilation_y)
{
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
const q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[idx_out_ch]);
int64_t acc_0 = 0;
int ker_y_start;
int ker_x_start;
int ker_y_end;
int ker_x_end;
if (dilation_x > 1)
{
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
ker_x_start = MAX(0, start_x_max);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
ker_x_end = MIN(kernel_x, end_min_x);
}
else
{
ker_x_start = MAX(0, -base_idx_x);
ker_x_end = MIN(kernel_x, input_x - base_idx_x);
}
if (dilation_y > 1)
{
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
ker_y_start = MAX(0, start_y_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
ker_y_end = MIN(kernel_y, end_min_y);
}
else
{
ker_y_start = MAX(0, -base_idx_y);
ker_y_end = MIN(kernel_y, input_y - base_idx_y);
}
if (bias)
{
acc_0 = bias[idx_out_ch];
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += input[idx_0] * kernel[ker_idx_0];
}
}
/* Requantize and clamp output to provided range */
int32_t result = arm_nn_requantize_s64(acc_0, reduced_multiplier, output_shift[idx_out_ch]);
result = MAX(result, output_activation_min);
result = MIN(result, output_activation_max);
*output++ = (int16_t)result;
}
}
}
}
/* Advance to the next batch */
input += (input_x * input_y * input_ch);
}
}
/*
* Basic s16 depthwise convolution function.
*
* Refer header file for details.
*
*/
arm_status arm_depthwise_conv_s16(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q15_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int64_t *bias,
const cmsis_nn_dims *output_dims,
q15_t *output)
{
const uint16_t dilation_x = dw_conv_params->dilation.w;
const uint16_t dilation_y = dw_conv_params->dilation.h;
(void)bias_dims;
(void)ctx;
depthwise_conv_s16_generic_s16(input,
input_dims->n,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->activation.min,
dw_conv_params->activation.max,
dilation_x,
dilation_y);
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
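One plausible reading of why the s16 kernels above accumulate in int64_t (with int64_t bias) while the s8 kernels get by with int32_t is the available headroom; the short check below is an added illustration with worst-case bounds, not constants taken from the library:

#include <assert.h>
#include <stdint.h>

static void s16_accumulator_headroom_check(void)
{
    /* Worst-case magnitude of one int16 * int8 product: 32768 * 128 = 2^22. */
    const int64_t worst_tap = 32768LL * 128LL;

    /* About 2^31 / 2^22 = 512 such taps already exhaust an int32_t, so a
     * 23x23 kernel (529 taps) could overflow it; int64_t leaves ample room. */
    assert((1LL << 31) / worst_tap == 512);
    assert(529 * worst_tap > (int64_t)INT32_MAX);
}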

View File

@@ -0,0 +1,347 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8.c
* Description: s8 version of depthwise convolution.
*
* $Date: 30. Dec 2021
* $Revision: V.2.7.1
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_s8_mult_4(const int8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const int8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
int8_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4] = {0, 0, 0, 0};
if (bias)
{
out_buff[0] = bias[out_ch + 0 + mult_tile];
out_buff[1] = bias[out_ch + 1 + mult_tile];
out_buff[2] = bias[out_ch + 2 + mult_tile];
out_buff[3] = bias[out_ch + 3 + mult_tile];
}
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
#pragma clang loop unroll(disable)
#endif
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
}
}
#if defined(ARM_MATH_MVEI)
(void)out_idx;
int32x4_t res = vldrwq_s32(out_buff);
res = arm_requantize_mve_32x4(res,
vldrwq_s32(&output_mult[out_ch + mult_tile]),
vldrwq_s32(&output_shift[out_ch + mult_tile]));
res = vaddq_n_s32(res, output_offset);
res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
res = vminq_s32(res, vdupq_n_s32(output_activation_max));
vstrbq_s32(output, res);
output += 4;
#else
out_buff[0] = arm_nn_requantize(
out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]);
out_buff[1] = arm_nn_requantize(
out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]);
out_buff[2] = arm_nn_requantize(
out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]);
out_buff[3] = arm_nn_requantize(
out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (int8_t)out_buff[0];
output[out_idx++] = (int8_t)out_buff[1];
output[out_idx++] = (int8_t)out_buff[2];
output[out_idx++] = (int8_t)out_buff[3];
#endif
}
}
}
}
}
static void depthwise_conv_s8_generic(const q7_t *input,
const uint16_t input_batches,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const q7_t *kernel,
const uint16_t output_ch,
const uint16_t ch_mult,
const uint16_t kernel_x,
const uint16_t kernel_y,
const uint16_t pad_x,
const uint16_t pad_y,
const uint16_t stride_x,
const uint16_t stride_y,
const int32_t *bias,
q7_t *output,
const int32_t *output_shift,
const int32_t *output_mult,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t output_activation_min,
const int32_t output_activation_max,
const uint16_t dilation_x,
const uint16_t dilation_y)
{
(void)output_ch;
int i_out = 0;
int i_batch;
for (i_batch = 0; i_batch < input_batches; i_batch++)
{
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0 = 0;
int ker_y_start;
int ker_x_start;
int ker_y_end;
int ker_x_end;
if (dilation_x > 1)
{
const int32_t start_x_max = (-base_idx_x + dilation_x - 1) / dilation_x;
ker_x_start = MAX(0, start_x_max);
const int32_t end_min_x = (input_x - base_idx_x + dilation_x - 1) / dilation_x;
ker_x_end = MIN(kernel_x, end_min_x);
}
else
{
ker_x_start = MAX(0, -base_idx_x);
ker_x_end = MIN(kernel_x, input_x - base_idx_x);
}
if (dilation_y > 1)
{
const int32_t start_y_max = (-base_idx_y + dilation_y - 1) / dilation_y;
ker_y_start = MAX(0, start_y_max);
const int32_t end_min_y = (input_y - base_idx_y + dilation_y - 1) / dilation_y;
ker_y_end = MIN(kernel_y, end_min_y);
}
else
{
ker_y_start = MAX(0, -base_idx_y);
ker_y_end = MIN(kernel_y, input_y - base_idx_y);
}
if (bias)
{
acc_0 = bias[idx_out_ch];
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + dilation_y * i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + dilation_x * i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0];
}
}
/* Requantize and clamp output to provided range */
acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
/* Advance to the next batch */
input += (input_x * input_y * input_ch);
}
}
/*
* Basic s8 depthwise convolution function.
*
* Refer header file for details.
* Optimization using DSP extension is not available for the generic case where channel multiplier is > 1.
*
*/
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const uint16_t dilation_x = dw_conv_params->dilation.w;
const uint16_t dilation_y = dw_conv_params->dilation.h;
(void)dw_conv_params->dilation;
(void)bias_dims;
(void)ctx;
if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
depthwise_conv_s8_mult_4(input,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
output_dims->c,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->output_offset,
dw_conv_params->input_offset,
dw_conv_params->activation.min,
dw_conv_params->activation.max);
}
else
{
depthwise_conv_s8_generic(input,
input_dims->n,
input_dims->w,
input_dims->h,
input_dims->c,
kernel,
output_dims->c,
dw_conv_params->ch_mult,
filter_dims->w,
filter_dims->h,
dw_conv_params->padding.w,
dw_conv_params->padding.h,
dw_conv_params->stride.w,
dw_conv_params->stride.h,
bias,
output,
quant_params->shift,
quant_params->multiplier,
output_dims->w,
output_dims->h,
dw_conv_params->output_offset,
dw_conv_params->input_offset,
dw_conv_params->activation.min,
dw_conv_params->activation.max,
dilation_x,
dilation_y);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
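The generic kernel above interleaves its outputs as idx_out_ch = i_ch_mult + i_input_ch * ch_mult, i.e. the ch_mult outputs derived from one input channel are stored contiguously. A tiny sketch of that mapping with made-up sizes, added here only for illustration:

#include <stdio.h>

int main(void)
{
    const int input_ch = 3, ch_mult = 2;   /* 6 output channels in total */
    for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
    {
        for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
        {
            printf("in_ch %d, mult %d -> out_ch %d\n",
                   i_input_ch, i_ch_mult, i_ch_mult + i_input_ch * ch_mult);
        }
    }
    return 0;   /* out_ch runs 0..5 in storage order */
}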

View File

@@ -0,0 +1,433 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_s8_opt.c
* Description: Optimized s8 depthwise separable convolution function for
* channel multiplier of 1.
*
* $Date: January 26, 2021
* $Revision: V.2.0.3
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel
*
* Refer prototype header file for details.
*
*/
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
const int32_t input_ch = input_dims->c;
const int32_t output_ch = output_dims->c;
/* Check input constraints input_ch == output_ch */
if (input_ch != output_ch)
{
return ARM_MATH_SIZE_MISMATCH;
}
if (ctx->buf == NULL && arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims) > 0)
{
return ARM_MATH_ARGUMENT_ERROR;
}
#ifdef ARM_MATH_DSP
const int32_t input_x = input_dims->w;
const int32_t input_y = input_dims->h;
const int32_t kernel_x = filter_dims->w;
const int32_t kernel_y = filter_dims->h;
const int32_t pad_x = dw_conv_params->padding.w;
const int32_t pad_y = dw_conv_params->padding.h;
const int32_t stride_x = dw_conv_params->stride.w;
const int32_t stride_y = dw_conv_params->stride.h;
const int32_t *output_shift = quant_params->shift;
const int32_t *output_mult = quant_params->multiplier;
const int32_t output_x = output_dims->w;
const int32_t output_y = output_dims->h;
const int32_t output_offset = dw_conv_params->output_offset;
const int32_t input_offset = dw_conv_params->input_offset;
const int32_t output_activation_min = dw_conv_params->activation.min;
const int32_t output_activation_max = dw_conv_params->activation.max;
q15_t *buffer_a = (q15_t *)ctx->buf;
#ifdef ARM_MATH_MVEI
(void)bias_dims;
/* Generate two columns from the input tensor */
q7_t *lhs_buffer = (q7_t *)buffer_a;
q7_t *out = output;
int padded = 0;
int buffer_count = 0;
const int32_t kernel_size = kernel_x * kernel_y;
/* This part implements the im2col function */
for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
{
for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
{
for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
{
for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
{
arm_memset_q7(lhs_buffer, (int8_t)-input_offset, (uint32_t)input_ch);
padded = 1;
}
else
{
arm_memcpy_q7(lhs_buffer, input + (i_ker_y * input_x + i_ker_x) * input_ch, (uint32_t)input_ch);
}
lhs_buffer += input_ch;
}
}
buffer_count++;
if (buffer_count == 4)
{
lhs_buffer = (q7_t *)buffer_a;
if (padded == 0)
{
out = arm_nn_depthwise_conv_nt_t_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
}
else
{
out = arm_nn_depthwise_conv_nt_t_padded_s8(lhs_buffer,
kernel,
input_offset,
input_ch,
output_shift,
output_mult,
output_offset,
output_activation_min,
output_activation_max,
kernel_size,
bias,
out);
padded = 0;
}
buffer_count = 0;
}
}
}
/* Handle left over buffers */
lhs_buffer = (q7_t *)buffer_a;
for (int i_buf = 0; i_buf < buffer_count; i_buf++)
{
int32_t loop_count = (input_ch + 3) / 4;
int32_t num_ch_to_process = input_ch;
for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
{
const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
const int8_t *row_0 = kernel + offset;
int32x4_t out_0 = vldrwq_s32(&bias[offset]);
for (int i_ker = 0; i_ker < kernel_size; i_ker++)
{
const int32x4_t ker_0 = vldrbq_s32(row_0);
int32x4_t ip_0 = vldrbq_s32(col_0);
ip_0 = vaddq_n_s32(ip_0, input_offset);
out_0 += vmulq_s32(ip_0, ker_0);
col_0 += input_ch;
row_0 += input_ch;
}
const int32x4_t mult = vldrwq_s32(&output_mult[offset]);
const int32x4_t shift = vldrwq_s32(&output_shift[offset]);
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_0 = vaddq_n_s32(out_0, output_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
vstrbq_p_s32(out, out_0, p);
out += 4;
}
const int tail_ch = input_ch & 0x3;
if (tail_ch != 0)
{
out -= (4 - tail_ch);
}
}
#else // ARM_MATH_MVEI
(void)bias_dims;
/* Run the following code in cores using DSP extension */
q15_t *const col_buffer_start = buffer_a;
q15_t *col_buffer = col_buffer_start;
const int32_t *const bias_start_pos = bias;
const q31_t *const out_mult_start_pos = output_mult;
const q31_t *const out_shift_start_pos = output_shift;
uint16_t row_count;
uint16_t row_shift;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
/* Out of bounds is only considered for the y axis as it provides a more contiguous zeroing
opportunity than along the x axis */
const int ker_y_start = MAX(0, -base_idx_y);
/* Condition for kernel end dimension: (base_idx_y + ker_y_end) < input_y */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
int32_t index = 0;
if (ker_y_start != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
index += (kernel_x * input_ch) * ker_y_start;
}
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
if (idx_x < 0 || idx_x >= input_x)
{
memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
}
else
{
arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
&col_buffer[index],
input_ch,
input_offset);
}
index += input_ch;
}
}
const int diff = kernel_y - ker_y_end;
if (diff != 0)
{
memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
}
row_count = output_ch / 4;
row_shift = 0;
bias = bias_start_pos;
output_mult = out_mult_start_pos;
output_shift = out_shift_start_pos;
while (row_count)
{
q31_t sum = *bias++;
q31_t sum_2 = *bias++;
q31_t sum_3 = *bias++;
q31_t sum_4 = *bias++;
uint16_t col_count = (kernel_x * kernel_y) / 2;
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
row_shift += 4;
while (col_count)
{
/* General idea is to read a 4 + 4 (input, kernel) pair and re-arrange them in the right order
for use in an SMLAD instruction. One run of this loop produces 4 partial outputs with 8 MACs. */
/* Note: variable names can be improved here to align with rows and columns. */
q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;
/* Read 4 weights */
ip_b1 = arm_nn_read_q7x4(row_pos);
ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
op_a = arm_nn_read_q15x2(col_pos);
op_b = arm_nn_read_q15x2(col_pos + input_ch);
ip_a2 = __SXTB16(ip_b1);
ip_b1 = __SXTB16(__ROR(ip_b1, 8));
ip_b2 = __SXTB16(ip_a1);
ip_a1 = __SXTB16(__ROR(ip_a1, 8));
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHBT(ip_b2, ip_a2, 16);
sum = __SMLAD(op_c, op_b, sum);
op_b = __PKHBT(ip_b1, ip_a1, 16);
sum_2 = __SMLAD(op_a, op_b, sum_2);
op_a = arm_nn_read_q15x2(col_pos + 2);
op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);
op_c = __PKHBT(op_b, op_a, 16);
op_a = __PKHTB(op_b, op_a, 16);
op_b = __PKHTB(ip_a2, ip_b2, 16);
sum_3 = __SMLAD(op_c, op_b, sum_3);
op_b = __PKHTB(ip_a1, ip_b1, 16);
sum_4 = __SMLAD(op_a, op_b, sum_4);
row_pos += input_ch << 1;
col_pos += input_ch << 1;
col_count--;
}
col_count = (kernel_x * kernel_y) & 0x1;
while (col_count)
{
sum += row_pos[0] * col_pos[0];
sum_2 += row_pos[1] * col_pos[1];
sum_3 += row_pos[2] * col_pos[2];
sum_4 += row_pos[3] * col_pos[3];
row_pos += input_ch;
col_pos += input_ch;
col_count--;
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
sum_2 += output_offset;
sum_2 = MAX(sum_2, output_activation_min);
sum_2 = MIN(sum_2, output_activation_max);
*output++ = (q7_t)sum_2;
sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
sum_3 += output_offset;
sum_3 = MAX(sum_3, output_activation_min);
sum_3 = MIN(sum_3, output_activation_max);
*output++ = (q7_t)sum_3;
sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
sum_4 += output_offset;
sum_4 = MAX(sum_4, output_activation_min);
sum_4 = MIN(sum_4, output_activation_max);
*output++ = (q7_t)sum_4;
row_count--;
}
row_count = output_ch & 0x3;
while (row_count)
{
q15_t *col_pos = col_buffer_start + row_shift;
const q7_t *row_pos = kernel + row_shift;
q31_t sum = *bias++;
const uint16_t col_count = (kernel_x * kernel_y);
row_shift += 1;
for (int i = 0; i < col_count; i++)
{
sum += row_pos[i * input_ch] * col_pos[i * input_ch];
}
sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
sum += output_offset;
sum = MAX(sum, output_activation_min);
sum = MIN(sum, output_activation_max);
*output++ = (q7_t)sum;
row_count--;
}
// clear counter and pointers
col_buffer = col_buffer_start;
}
}
#endif /* ARM_MATH_MVEI */
#else
/* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
return arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
kernel,
bias_dims,
bias,
output_dims,
output);
#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_MVEI)
/* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions. */
return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
#elif defined(ARM_MATH_DSP)
return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
(void)input_dims;
(void)filter_dims;
return 0;
#endif
}
/**
* @} end of NNConv group
*/
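A small worked example of the scratch sizes reported by arm_depthwise_conv_s8_opt_get_buffer_size() above for a 3x3 kernel over 64 channels; the dimensions are placeholders and the two figures simply evaluate the formulas in the function for the MVE and DSP builds:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int32_t c = 64, kw = 3, kh = 3;
    const int32_t dsp_bytes = c * kw * kh * (int32_t)sizeof(int16_t);          /* 1152 */
    const int32_t mve_bytes = 2 * c * kw * kh * (int32_t)sizeof(int16_t) + 4;  /* 2308 */
    printf("MVE: %ld bytes, DSP: %ld bytes, pure C: 0 bytes\n",
           (long)mve_bytes, (long)dsp_bytes);
    return 0;
}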

View File

@@ -0,0 +1,336 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_u8_basic_ver1.c
* Description: u8 depthwise convolution function
*
* $Date: 09. October 2020
* $Revision: V.1.1.1
*
* Target : Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
static void depthwise_conv_u8_mult_4(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
{
for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
{
for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
++in_ch, out_ch += ch_mult)
{
for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
{
int32_t out_buff[4];
out_buff[0] = 0;
out_buff[1] = 0;
out_buff[2] = 0;
out_buff[3] = 0;
for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
{
int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;
for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
++ker_w, ker_idx += output_ch)
{
int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
out_buff[0] += in_val * (kernel[ker_idx + 0 + mult_tile] + filter_offset);
out_buff[1] += in_val * (kernel[ker_idx + 1 + mult_tile] + filter_offset);
out_buff[2] += in_val * (kernel[ker_idx + 2 + mult_tile] + filter_offset);
out_buff[3] += in_val * (kernel[ker_idx + 3 + mult_tile] + filter_offset);
}
}
if (bias != NULL)
{
out_buff[0] += bias[out_ch + 0 + mult_tile];
out_buff[1] += bias[out_ch + 1 + mult_tile];
out_buff[2] += bias[out_ch + 2 + mult_tile];
out_buff[3] += bias[out_ch + 3 + mult_tile];
}
out_buff[0] = arm_nn_requantize(out_buff[0], output_mult, output_shift);
out_buff[1] = arm_nn_requantize(out_buff[1], output_mult, output_shift);
out_buff[2] = arm_nn_requantize(out_buff[2], output_mult, output_shift);
out_buff[3] = arm_nn_requantize(out_buff[3], output_mult, output_shift);
out_buff[0] += output_offset;
out_buff[1] += output_offset;
out_buff[2] += output_offset;
out_buff[3] += output_offset;
out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);
output[out_idx++] = (uint8_t)out_buff[0];
output[out_idx++] = (uint8_t)out_buff[1];
output[out_idx++] = (uint8_t)out_buff[2];
output[out_idx++] = (uint8_t)out_buff[3];
}
}
}
}
}
static void depthwise_conv_u8_generic(const uint8_t *input,
const int32_t input_x,
const int32_t input_y,
const int32_t input_ch,
const uint8_t *kernel,
const int32_t output_ch,
const int32_t ch_mult,
const int32_t kernel_x,
const int32_t kernel_y,
const int32_t pad_x,
const int32_t pad_y,
const int32_t stride_x,
const int32_t stride_y,
const int32_t *bias,
uint8_t *output,
const int32_t output_shift,
const int32_t output_mult,
const int32_t output_x,
const int32_t output_y,
const int32_t output_offset,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_activation_min,
const int32_t output_activation_max)
{
(void)output_ch;
int i_out = 0;
for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
{
const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
{
const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
{
for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
{
const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
int32_t acc_0;
/* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */
const int ker_y_start = MAX(0, -base_idx_y);
const int ker_x_start = MAX(0, -base_idx_x);
/* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */
const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);
acc_0 = 0;
for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
{
const int32_t idx_y = base_idx_y + i_ker_y;
for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
{
const int32_t idx_x = base_idx_x + i_ker_x;
int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;
acc_0 += (input[idx_0] + input_offset) * (kernel[ker_idx_0] + filter_offset);
}
}
if (bias != NULL)
{
acc_0 += bias[idx_out_ch];
}
/* Requantize and clamp output to provided range */
acc_0 = arm_nn_requantize(acc_0, output_mult, output_shift);
acc_0 += output_offset;
acc_0 = MAX(acc_0, output_activation_min);
acc_0 = MIN(acc_0, output_activation_max);
output[i_out++] = acc_0;
}
}
}
}
}
/**
* @brief uint8 depthwise convolution function with asymmetric quantization
*
* @param[in] input Pointer to input tensor
* @param[in] input_x Width of input tensor
* @param[in] input_y Height of input tensor
* @param[in] input_ch Channels in input tensor
* @param[in] kernel Pointer to kernel weights
* @param[in] kernel_x Width of kernel
* @param[in] kernel_y Height of kernel
 * @param[in] ch_mult Channel multiplier, i.e. number of output channels per input channel
* @param[in] pad_x Padding sizes x
* @param[in] pad_y Padding sizes y
* @param[in] stride_x Convolution stride along the width
* @param[in] stride_y Convolution stride along the height
* @param[in] dilation_x Dilation along width. Not used and intended for future enhancement.
* @param[in] dilation_y Dilation along height. Not used and intended for future enhancement.
* @param[in] bias Pointer to optional bias values. If no bias is
* available, NULL is expected
* @param[in] input_offset Input tensor zero offset
* @param[in] filter_offset Kernel tensor zero offset
* @param[in] output_offset Output tensor zero offset
* @param[in,out] output Pointer to output tensor
* @param[in] output_x Width of output tensor
* @param[in] output_y Height of output tensor
* @param[in] output_activation_min Minimum value to clamp the output to. Range : {0, 255}
 * @param[in] output_activation_max Maximum value to clamp the output to. Range : {0, 255}
* @param[in] output_shift Amount of right-shift for output
* @param[in] output_mult Output multiplier for requantization
* @return The function returns one of the following
 * <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported tensor dimensions
* <code>ARM_MATH_SUCCESS</code> - Successful operation
* <code>ARM_MATH_ARGUMENT_ERROR</code> - Implementation not available
*
*
*/
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
const uint16_t input_x,
const uint16_t input_y,
const uint16_t input_ch,
const uint8_t *kernel,
const uint16_t kernel_x,
const uint16_t kernel_y,
const int16_t ch_mult,
const int16_t pad_x,
const int16_t pad_y,
const int16_t stride_x,
const int16_t stride_y,
const int16_t dilation_x,
const int16_t dilation_y,
const int32_t *bias,
const int32_t input_offset,
const int32_t filter_offset,
const int32_t output_offset,
uint8_t *output,
const uint16_t output_x,
const uint16_t output_y,
const int32_t output_activation_min,
const int32_t output_activation_max,
const int32_t output_shift,
const int32_t output_mult)
{
(void)dilation_x;
(void)dilation_y;
if (ch_mult % 4 == 0)
{
depthwise_conv_u8_mult_4(input,
input_x,
input_y,
input_ch,
kernel,
ch_mult * input_ch,
ch_mult,
kernel_x,
kernel_y,
pad_x,
pad_y,
stride_x,
stride_y,
bias,
output,
output_shift,
output_mult,
output_x,
output_y,
output_offset,
input_offset,
filter_offset,
output_activation_min,
output_activation_max);
}
else
{
depthwise_conv_u8_generic(input,
input_x,
input_y,
input_ch,
kernel,
ch_mult * input_ch,
ch_mult,
kernel_x,
kernel_y,
pad_x,
pad_y,
stride_x,
stride_y,
bias,
output,
output_shift,
output_mult,
output_x,
output_y,
output_offset,
input_offset,
filter_offset,
output_activation_min,
output_activation_max);
}
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/
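A single-tap sketch of the u8 pipeline above (add offsets, accumulate, requantize, add output offset, clamp). All quantization values are placeholders, and a plain multiply-and-shift stands in for arm_nn_requantize(), which additionally rounds and saturates, so treat the numbers as approximate:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    const int32_t input = 200, input_offset = -128;
    const int32_t weight = 156, filter_offset = -128;
    const int32_t bias = 50;
    int32_t acc = (input + input_offset) * (weight + filter_offset) + bias;  /* 72*28 + 50 = 2066 */

    /* Stand-in for arm_nn_requantize(): scale by multiplier / 2^(31 - shift). */
    const int32_t output_mult = 1 << 28, output_shift = -1;     /* roughly multiply by 1/16 */
    acc = (int32_t)(((int64_t)acc * output_mult) >> (31 - output_shift));    /* ~2066/16 = 129 */

    const int32_t output_offset = 128;
    acc += output_offset;                                        /* 257 */
    acc = MIN(MAX(acc, 0), 255);                                 /* clamped to 255 */
    printf("u8 output: %d\n", acc);
    return 0;
}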

View File

@@ -0,0 +1,135 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_conv_wrapper_s8.c
* Description: Wrapper API to select appropriate depthwise conv API based
* on dimensions.
*
* $Date: 20. Dec 2021
* $Revision: V.1.4.0
*
* Target Processor: Cortex-M CPUs
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/*
* s8 Depthwise conv wrapper function
*
* Refer header file for details.
*
*/
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const q7_t *input,
const cmsis_nn_dims *filter_dims,
const q7_t *filter,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
q7_t *output)
{
arm_status status = ARM_MATH_SUCCESS;
if (1 == dw_conv_params->ch_mult && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
#if !defined(ARM_MATH_MVEI)
if ((filter_dims->w == 3) && (filter_dims->h == 3) && (dw_conv_params->padding.h <= 1) &&
(dw_conv_params->padding.w <= 1))
{
status = arm_depthwise_conv_3x3_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
else
#endif
{
status = arm_depthwise_conv_s8_opt(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
}
else
{
status = arm_depthwise_conv_s8(ctx,
dw_conv_params,
quant_params,
input_dims,
input,
filter_dims,
filter,
bias_dims,
bias,
output_dims,
output);
}
/* Return to application */
return status;
}
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims)
{
(void)dw_conv_params;
int32_t size = 0;
if (input_dims->c == output_dims->c && input_dims->n == 1 && dw_conv_params->dilation.w == 1 &&
dw_conv_params->dilation.h == 1)
{
size = arm_depthwise_conv_s8_opt_get_buffer_size(input_dims, filter_dims);
}
return size;
}
/**
* @} end of NNConv group
*/
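A minimal usage sketch for the depthwise wrapper above, mirroring the convolution wrapper: query the scratch size, wrap it in a context, then call the wrapper. Shapes, offsets and activation limits are placeholders, and cmsis_nn_context is assumed to carry the buf/size pair from arm_nn_types.h:

#include <stdlib.h>
#include "arm_nnfunctions.h"

arm_status run_dw_conv_s8_example(const q7_t *in, const q7_t *wt, const int32_t *bias,
                                  int32_t *mult, int32_t *shift, q7_t *out)
{
    const cmsis_nn_dims input_dims  = {.n = 1, .h = 16, .w = 16, .c = 8};
    const cmsis_nn_dims filter_dims = {.n = 1, .h = 3, .w = 3, .c = 8};
    const cmsis_nn_dims bias_dims   = {.n = 1, .h = 1, .w = 1, .c = 8};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 14, .w = 14, .c = 8};

    cmsis_nn_dw_conv_params dw_params = {0};   /* offsets left at 0 for the sketch */
    dw_params.ch_mult = 1;                     /* in_ch == out_ch fast path */
    dw_params.stride.w = 1;   dw_params.stride.h = 1;
    dw_params.dilation.w = 1; dw_params.dilation.h = 1;
    dw_params.activation.min = -128;
    dw_params.activation.max = 127;

    cmsis_nn_per_channel_quant_params quant_params = {.multiplier = mult, .shift = shift};

    cmsis_nn_context ctx;
    ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(&dw_params, &input_dims,
                                                             &filter_dims, &output_dims);
    ctx.buf = (ctx.size > 0) ? malloc((size_t)ctx.size) : NULL;

    arm_status status = arm_depthwise_conv_wrapper_s8(&ctx, &dw_params, &quant_params,
                                                      &input_dims, in, &filter_dims, wt,
                                                      &bias_dims, bias, &output_dims, out);
    free(ctx.buf);
    return status;
}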

View File

@@ -0,0 +1,422 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7.c
* Description: Q7 depthwise separable convolution function
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel filter kernel size
* @param[in] padding padding sizes
* @param[in] stride convolution stride
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out output tensor dimension
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
*
* <b>Buffer size:</b>
*
* bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
*
* bufferB size: 0
*
* <b>Input dimension constraints:</b>
*
* ch_im_in equals ch_im_out
*
* Implementation:
 * There are 3 nested loops here:
 * Inner loop: calculate each output value with MAC instructions over an accumulator
 * Mid loop: loop over the different output channels
 * Outer loop: loop over the different output positions (x, y)
*/
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
const uint16_t dim_im_in,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel,
const uint16_t padding,
const uint16_t stride,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *)bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
/* we first do im2col here */
for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++)
{
for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
}
else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
*/
memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#else
/*
* r0 r1 r2 r3 r4 r5
* inA1, inA2, inB1, inB2, opA, opB
*/
asm volatile("COL_LOOP_%=:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP_%=\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#endif /* ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
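/* Handle the leftover kernel position when dim_kernel * dim_kernel is odd:
 * one q7x4 read covers the 4 channels of this output group. */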
colCnt = (dim_kernel * dim_kernel) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
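/* Requantize: arithmetic right shift by out_shift, then saturate each accumulator to q7 */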
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel * dim_kernel);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
rowCnt--;
}
/* clear counter and pointers */
pBuffer = colBuffer;
}
}
#else
(void)bufferA;
/* Run the following code as a reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out, i_ker_x, i_ker_y;
int conv_out;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
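/*
 * Reference HWC loop: for every output pixel and channel, accumulate the
 * kernel window over the input; taps that fall outside the image are
 * skipped, which is equivalent to zero padding.
 */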
for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel; i_ker_x++)
{
int in_row = stride * i_out_y + i_ker_y - padding;
int in_col = stride * i_out_x + i_ker_x - padding;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in && in_col < dim_im_in)
{
conv_out += Im_in[(in_row * dim_im_in + in_col) * ch_im_in + i_ch_out] *
wt[(i_ker_y * dim_kernel + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out + i_out_x) * ch_im_out + i_ch_out] =
(q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,427 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_depthwise_separable_conv_HWC_q7_nonsquare.c
* Description: Q7 depthwise separable convolution function (non-square shape)
*
* $Date: July 20, 2021
* $Revision: V.1.1.2
*
* Target Processor: Cortex-M cores
*
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @ingroup groupNN
*/
/**
* @addtogroup NNConv
* @{
*/
/**
* @brief Q7 depthwise separable convolution function (non-square shape)
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in_x input tensor dimension x
* @param[in] dim_im_in_y input tensor dimension y
* @param[in] ch_im_in number of input tensor channels
* @param[in] wt pointer to kernel weights
* @param[in] ch_im_out number of filters, i.e., output tensor channels
* @param[in] dim_kernel_x filter kernel size x
* @param[in] dim_kernel_y filter kernel size y
* @param[in] padding_x padding sizes x
* @param[in] padding_y padding sizes y
* @param[in] stride_x convolution stride x
* @param[in] stride_y convolution stride y
* @param[in] bias pointer to bias
* @param[in] bias_shift amount of left-shift for bias
* @param[in] out_shift amount of right-shift for output
* @param[in,out] Im_out pointer to output tensor
* @param[in] dim_im_out_x output tensor dimension x
* @param[in] dim_im_out_y output tensor dimension y
* @param[in,out] bufferA pointer to buffer space for input
* @param[in,out] bufferB pointer to buffer space for output
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 * ch_im_in must be equal to ch_im_out
*
*/
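/*
 * A minimal, hypothetical call sketch. All names and sizes below are
 * illustrative and not taken from this file. bufferB is unused by this
 * implementation and may be NULL; bufferA is reused as the q7 im2col column
 * buffer, so ch_im_in * dim_kernel_x * dim_kernel_y bytes are needed (the
 * commonly documented 2 * ch_im_in * dim_kernel_x * dim_kernel_y q15 entries
 * are more than enough). The bias/output shifts are arbitrary here.
 *
 *   q7_t  in[32 * 24 * 16], out[32 * 24 * 16], wts[3 * 3 * 16], biases[16];
 *   q15_t scratch[2 * 16 * 3 * 3];
 *   arm_status st = arm_depthwise_separable_conv_HWC_q7_nonsquare(
 *       in, 32, 24, 16, wts, 16, 3, 3, 1, 1, 1, 1,
 *       biases, 0, 7, out, 32, 24, scratch, NULL);
 */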
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
const uint16_t dim_im_in_x,
const uint16_t dim_im_in_y,
const uint16_t ch_im_in,
const q7_t *wt,
const uint16_t ch_im_out,
const uint16_t dim_kernel_x,
const uint16_t dim_kernel_y,
const uint16_t padding_x,
const uint16_t padding_y,
const uint16_t stride_x,
const uint16_t stride_y,
const q7_t *bias,
const uint16_t bias_shift,
const uint16_t out_shift,
q7_t *Im_out,
const uint16_t dim_im_out_x,
const uint16_t dim_im_out_y,
q15_t *bufferA,
q7_t *bufferB)
{
(void)bufferB;
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
/* Run the following code for Cortex-M4 and Cortex-M7 */
/*
* Implementation:
 * There are 3 nested loops here:
* Inner loop: calculate each output value with MAC instruction over an accumulator
* Mid loop: loop over different output channel
* Outer loop: loop over different output (x, y)
*
*/
int16_t i_out_y, i_out_x;
int16_t i_ker_y, i_ker_x;
q7_t *colBuffer = (q7_t *)bufferA;
q7_t *pBuffer = colBuffer;
const q7_t *pBias = bias;
q7_t *pOut = Im_out;
uint16_t rowCnt;
uint16_t row_shift;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
/* we first do im2col here */
for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y;
i_ker_y++)
{
for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x;
i_ker_x++)
{
if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
{
/* arm_fill_q7(0, pBuffer, ch_im_in); */
memset(pBuffer, 0, ch_im_in);
}
else
{
/* arm_copy_q7((q7_t *) Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer,
* ch_im_in); */
memcpy(pBuffer, (q7_t *)Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, ch_im_in);
}
pBuffer += ch_im_in;
}
}
/* we will do the computation here for each channel */
rowCnt = ch_im_out >> 2;
row_shift = 0;
pBias = bias;
while (rowCnt)
{
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y) >> 1;
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
row_shift += 4;
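/*
 * Inner loop: each iteration consumes two kernel positions for 4 consecutive
 * channels. The q7x4 reads are re-packed with __PKHBT/__PKHTB so that
 * __SXTB16 (plus __ROR by 8) extracts the even/odd byte pairs, and the four
 * __SMLAD accumulators (sum..sum4) each hold one per-channel dot product.
 */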
#ifdef USE_INTRINSIC
#ifndef ARM_MATH_BIG_ENDIAN
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHTB(opB, inB1, 16);
inB1 = __PKHBT(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHTB(opB, inA1, 16);
inA1 = __PKHBT(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum3 = __SMLAD(opA, opB, sum3);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum4 = __SMLAD(opA, opB, sum4);
colCnt--;
}
#else
while (colCnt)
{
q31_t inA1, inA2, inB1, inB2, opA, opB;
inB1 = arm_nn_read_q7x4(pB);
pB += ch_im_in;
opB = arm_nn_read_q7x4(pB);
pB += ch_im_in;
inB2 = __PKHBT(opB, inB1, 16);
inB1 = __PKHTB(inB1, opB, 16);
inA1 = arm_nn_read_q7x4(pA);
pA += ch_im_in;
opB = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inA2 = __PKHBT(opB, inA1, 16);
inA1 = __PKHTB(inA1, opB, 16);
opA = __SXTB16(inA1);
opB = __SXTB16(inB1);
sum2 = __SMLAD(opA, opB, sum2);
opA = __SXTB16(__ROR(inA1, 8));
opB = __SXTB16(__ROR(inB1, 8));
sum = __SMLAD(opA, opB, sum);
opA = __SXTB16(inA2);
opB = __SXTB16(inB2);
sum4 = __SMLAD(opA, opB, sum4);
opA = __SXTB16(__ROR(inA2, 8));
opB = __SXTB16(__ROR(inB2, 8));
sum3 = __SMLAD(opA, opB, sum3);
colCnt--;
}
#endif /* ARM_MATH_BIG_ENDIAN */
#else
#ifndef ARM_MATH_BIG_ENDIAN
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhtb r3, r5, r2, ASR #16\n"
"pkhbt r2, r2, r5, LSL #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhtb r1, r5, r0, ASR #16\n"
"pkhbt r0, r0, r5, LSL #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum], r4, r5, %[sum]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#else
// r0 r1 r2 r3 r4 r5
// inA1, inA2, inB1, inB2, opA, opB
asm volatile("COL_LOOP:\n"
"ldr.w r2, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"ldr.w r5, [%[pB], #0]\n"
"add.w %[pB], %[pB], %[ch_im_in]\n"
"pkhbt r3, r5, r2, LSL #16\n"
"pkhtb r2, r2, r5, ASR #16\n"
"ldr.w r0, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"ldr.w r5, [%[pA], #0]\n"
"add.w %[pA], %[pA], %[ch_im_in]\n"
"pkhbt r1, r5, r0, LSL #16\n"
"pkhtb r0, r0, r5, ASR #16\n"
"sxtb16 r4, r0\n"
"sxtb16 r5, r2\n"
"smlad %[sum2], r4, r5, %[sum2]\n"
"mov.w r4, r0, ror #8\n"
"mov.w r5, r2, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum], r4, r5, %[sum]\n"
"sxtb16 r4, r1\n"
"sxtb16 r5, r3\n"
"smlad %[sum4], r4, r5, %[sum4]\n"
"mov.w r4, r1, ror #8\n"
"mov.w r5, r3, ror #8\n"
"sxtb16 r4, r4\n"
"sxtb16 r5, r5\n"
"smlad %[sum3], r4, r5, %[sum3]\n"
"subs %[colCnt], #1\n"
"bne COL_LOOP\n"
: [ sum ] "+r"(sum),
[ sum2 ] "+r"(sum2),
[ sum3 ] "+r"(sum3),
[ sum4 ] "+r"(sum4),
[ pB ] "+r"(pB),
[ pA ] "+r"(pA)
: [ colCnt ] "r"(colCnt), [ ch_im_in ] "r"(ch_im_in)
: "r0", "r1", "r2", "r3", "r4", "r5");
#endif /*ARM_MATH_BIG_ENDIAN */
#endif /* USE_INTRINSIC */
colCnt = (dim_kernel_x * dim_kernel_y) & 0x1;
while (colCnt)
{
union arm_nnword inA, inB;
inA.word = arm_nn_read_q7x4(pA);
pA += ch_im_in;
inB.word = arm_nn_read_q7x4(pB);
pB += ch_im_in;
sum += inA.bytes[0] * inB.bytes[0];
sum2 += inA.bytes[1] * inB.bytes[1];
sum3 += inA.bytes[2] * inB.bytes[2];
sum4 += inA.bytes[3] * inB.bytes[3];
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
rowCnt--;
}
rowCnt = ch_im_out & 0x3;
while (rowCnt)
{
q7_t *pB = colBuffer + row_shift;
const q7_t *pA = wt + row_shift;
q31_t sum = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = (dim_kernel_x * dim_kernel_y);
row_shift += 1;
while (colCnt)
{
q7_t A1 = *pA;
q7_t B1 = *pB;
pA += ch_im_in;
pB += ch_im_in;
sum += A1 * B1;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
rowCnt--;
}
// clear counter and pointers
pBuffer = colBuffer;
}
}
#else
(void)bufferA;
/* Run the following code as a reference implementation for Cortex-M0 and Cortex-M3 */
int i_out_y, i_out_x, i_ch_out;
int i_ker_y, i_ker_x;
/* do some checking here, basically ch_im_in == ch_im_out */
if (ch_im_in != ch_im_out)
{
return ARM_MATH_SIZE_MISMATCH;
}
for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
{
for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
{
for (i_ch_out = 0; i_ch_out < ch_im_out; i_ch_out++)
{
// for each output
int conv_out = ((q31_t)(bias[i_ch_out]) << bias_shift) + NN_ROUND(out_shift);
for (i_ker_y = 0; i_ker_y < dim_kernel_y; i_ker_y++)
{
for (i_ker_x = 0; i_ker_x < dim_kernel_x; i_ker_x++)
{
int in_row = stride_y * i_out_y + i_ker_y - padding_y;
int in_col = stride_x * i_out_x + i_ker_x - padding_x;
if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
{
conv_out += Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + i_ch_out] *
wt[(i_ker_y * dim_kernel_x + i_ker_x) * ch_im_out + i_ch_out];
}
}
}
Im_out[(i_out_y * dim_im_out_x + i_out_x) * ch_im_out + i_ch_out] =
(q7_t)__SSAT((conv_out >> out_shift), 8);
}
}
}
#endif /* ARM_MATH_DSP */
/* Return to application */
return ARM_MATH_SUCCESS;
}
/**
* @} end of NNConv group
*/

View File

@@ -0,0 +1,218 @@
/*
* Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_depthwise_conv_s8_core.c
* Description: Depthwise convolution on im2col buffers.
*
* $Date: 09. October 2020
* $Revision: V.1.0.4
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"
/*
 * Depthwise convolution on an im2col buffer where the number of input
 * channels equals the number of output channels.
*
 * Refer to the header file for details.
*
*/
q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
const q15_t *col,
const uint16_t num_ch,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t out_offset,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t kernel_size,
const int32_t *const output_bias,
q7_t *out)
{
#if defined(ARM_MATH_MVEI)
int32_t ch_per_loop = num_ch / 4;
const int32_t *bias = output_bias;
int8_t *out_tmp = out;
int32_t idx = 0;
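/*
 * Main MVE loop: each pass handles 4 channels for two adjacent output
 * pixels (out_0/out_1). The inner loop unrolls 3 kernel positions, loading
 * 4 kernel bytes (vldrbq_s32) and 4 im2col q15 values (vldrhq_s32) per
 * position and multiply-accumulating into 32-bit lanes.
 */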
while (ch_per_loop > 0)
{
int32x4_t ip_0;
int32x4_t ip_1;
int32_t ker_loop = kernel_size / 3;
int32x4_t out_0 = vldrwq_s32(bias);
int32x4_t out_1 = out_0;
bias += 4;
const int32_t offset = idx * 4;
const int8_t *row_0 = row + offset;
const int16_t *col_0 = col + offset;
const int16_t *col_1 = col + kernel_size * num_ch + offset;
int32x4_t ker_0 = vldrbq_s32(row_0);
while (ker_loop > 0)
{
const int8_t *row_1 = row_0 + num_ch;
const int8_t *row_2 = row_0 + 2 * num_ch;
const int32x4_t ker_1 = vldrbq_s32(row_1);
const int32x4_t ker_2 = vldrbq_s32(row_2);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_1);
out_1 += vmulq_s32(ip_1, ker_1);
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
col_0 += num_ch;
col_1 += num_ch;
out_0 += vmulq_s32(ip_0, ker_2);
out_1 += vmulq_s32(ip_1, ker_2);
row_0 += 3 * num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
idx++;
/* Handle tail kernel elements */
ker_loop = kernel_size - ((kernel_size / 3) * 3);
while (ker_loop > 0)
{
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
out_0 += vmulq_s32(ip_0, ker_0);
out_1 += vmulq_s32(ip_1, ker_0);
col_0 += num_ch;
col_1 += num_ch;
ip_0 = vldrhq_s32(col_0);
ip_1 = vldrhq_s32(col_1);
row_0 += num_ch;
ker_0 = vldrbq_s32(row_0);
ker_loop--;
}
const int32x4_t mult = vldrwq_s32(out_mult);
const int32x4_t shift = vldrwq_s32(out_shift);
out_mult += 4;
out_shift += 4;
out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
out_0 = vaddq_n_s32(out_0, out_offset);
out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp, out_0);
out_1 = vaddq_n_s32(out_1, out_offset);
out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
vstrbq_s32(out_tmp + num_ch, out_1);
out_tmp += 4;
ch_per_loop--;
}
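/* Leftover 1..3 channels: scalar multiply-accumulates gathered into vector
 * lanes, then requantized and stored with tail predication (vctp32q). */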
int32_t tail_ch = num_ch & 3;
if (tail_ch != 0)
{
int32_t ch_idx = (num_ch & ~3);
int32x4_t col_0_sum;
int32x4_t col_1_sum;
const int32_t single_buffer_size = kernel_size * num_ch;
for (int i = 0; i < tail_ch; i++)
{
const int16_t *col_pos_0 = col + ch_idx;
const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
const int8_t *row_pos = row + ch_idx;
int32_t sum_0 = bias[i];
int32_t sum_1 = bias[i];
for (int j = 0; j < kernel_size; j++)
{
const int8_t row_val = row_pos[j * num_ch];
sum_0 += row_val * col_pos_0[j * num_ch];
sum_1 += row_val * col_pos_1[j * num_ch];
}
col_0_sum[i] = sum_0;
col_1_sum[i] = sum_1;
ch_idx++;
}
const mve_pred16_t p = vctp32q((uint32_t)tail_ch);
const int32x4_t mult = vldrwq_z_s32(out_mult, p);
const int32x4_t shift = vldrwq_z_s32(out_shift, p);
col_0_sum = arm_requantize_mve_32x4(col_0_sum, mult, shift);
col_1_sum = arm_requantize_mve_32x4(col_1_sum, mult, shift);
col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp, col_0_sum, p);
col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
out_tmp += tail_ch;
}
return out_tmp + num_ch;
#else
(void)row;
(void)col;
(void)num_ch;
(void)out_shift;
(void)out_mult;
(void)out_offset;
(void)activation_min;
(void)activation_max;
(void)kernel_size;
(void)output_bias;
(void)out;
return NULL;
#endif
}

View File

@@ -0,0 +1,186 @@
/*
* Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_nn_mat_mult_kernel_q7_q15.c
* Description: Matrix-multiplication function for convolution
*
* $Date: January 26, 2021
* $Revision: V.1.0.2
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"
/**
* @brief Matrix-multiplication function for convolution.
*
* @details Refer to header file for details.
*
*/
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
const q15_t *pInBuffer,
const uint16_t ch_im_out,
const uint16_t numCol_A,
const uint16_t bias_shift,
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut)
{
#if defined(ARM_MATH_DSP)
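/*
 * DSP path: the kernel computes a 2x2 output tile per pass -- two filter
 * rows (pA, pA2) against two im2col columns (pB, pB2) -- keeping four
 * accumulators (sum..sum4) and using __SMLAD on the q15 pairs produced by
 * read_and_pad().
 */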
/* set up the second output pointers */
q7_t *pOut2 = pOut + ch_im_out;
const q7_t *pBias = bias;
uint16_t rowCnt = ch_im_out >> 1;
/* loop over the rows of A, two rows at a time */
while (rowCnt)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* align the second pointer for A */
const q7_t *pA2 = pA + numCol_A;
/* init the sum with bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
q31_t sum3 = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum4 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
/* accumulate over the vector */
while (colCnt)
{
q31_t inA11, inA12, inA21, inA22;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
pA2 = read_and_pad(pA2, &inA21, &inA22);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
sum3 = __SMLAD(inA21, inB1, sum3);
sum4 = __SMLAD(inA21, inB2, sum4);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
sum3 = __SMLAD(inA22, inB1, sum3);
sum4 = __SMLAD(inA22, inB2, sum4);
colCnt--;
} /* while over colCnt */
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q7_t inA2 = *pA2++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
sum3 += inA2 * inB1;
sum4 += inA2 * inB2;
colCnt--;
} /* while over colCnt */
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut++ = (q7_t)__SSAT((sum3 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum4 >> out_shift), 8);
/* skip the row computed with A2 */
pA += numCol_A;
rowCnt--;
} /* for over ch_im_out */
/* compute left-over row if any */
if (ch_im_out & 0x1)
{
/* setup pointers for B */
const q15_t *pB = pInBuffer;
const q15_t *pB2 = pB + numCol_A;
/* load the bias */
q31_t sum = ((q31_t)(*pBias) << bias_shift) + NN_ROUND(out_shift);
q31_t sum2 = ((q31_t)(*pBias++) << bias_shift) + NN_ROUND(out_shift);
uint16_t colCnt = numCol_A >> 2;
while (colCnt)
{
q31_t inA11, inA12;
q31_t inB1 = arm_nn_read_q15x2_ia(&pB);
q31_t inB2 = arm_nn_read_q15x2_ia(&pB2);
pA = read_and_pad(pA, &inA11, &inA12);
sum = __SMLAD(inA11, inB1, sum);
sum2 = __SMLAD(inA11, inB2, sum2);
inB1 = arm_nn_read_q15x2_ia(&pB);
inB2 = arm_nn_read_q15x2_ia(&pB2);
sum = __SMLAD(inA12, inB1, sum);
sum2 = __SMLAD(inA12, inB2, sum2);
colCnt--;
}
colCnt = numCol_A & 0x3;
while (colCnt)
{
q7_t inA1 = *pA++;
q15_t inB1 = *pB++;
q15_t inB2 = *pB2++;
sum += inA1 * inB1;
sum2 += inA1 * inB2;
colCnt--;
}
*pOut++ = (q7_t)__SSAT((sum >> out_shift), 8);
*pOut2++ = (q7_t)__SSAT((sum2 >> out_shift), 8);
}
pOut += ch_im_out;
/* return the new output pointer with offset */
return pOut;
#else
(void)pA;
(void)pInBuffer;
(void)ch_im_out;
(void)numCol_A;
(void)bias_shift;
(void)out_shift;
(void)bias;
(void)pOut;
/* To be completed */
return NULL;
#endif /* ARM_MATH_DSP */
}

Some files were not shown because too many files have changed in this diff.