инициализация spi перенесена на cube (работает)

убрана кастомная библиотека periph_general
This commit is contained in:
2025-06-27 15:28:58 +03:00
parent 981dbf9bfa
commit a9a2466359
1587 changed files with 403396 additions and 250770 deletions

View File

@@ -0,0 +1,87 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctions.c
* Description: Combination of all basic math function source files.
*
* $Date: 16. March 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f32.c"
#include "arm_abs_f64.c"
#include "arm_abs_q15.c"
#include "arm_abs_q31.c"
#include "arm_abs_q7.c"
#include "arm_add_f32.c"
#include "arm_add_f64.c"
#include "arm_add_q15.c"
#include "arm_add_q31.c"
#include "arm_add_q7.c"
#include "arm_and_u16.c"
#include "arm_and_u32.c"
#include "arm_and_u8.c"
#include "arm_dot_prod_f32.c"
#include "arm_dot_prod_f64.c"
#include "arm_dot_prod_q15.c"
#include "arm_dot_prod_q31.c"
#include "arm_dot_prod_q7.c"
#include "arm_mult_f32.c"
#include "arm_mult_f64.c"
#include "arm_mult_q15.c"
#include "arm_mult_q31.c"
#include "arm_mult_q7.c"
#include "arm_negate_f32.c"
#include "arm_negate_f64.c"
#include "arm_negate_q15.c"
#include "arm_negate_q31.c"
#include "arm_negate_q7.c"
#include "arm_not_u16.c"
#include "arm_not_u32.c"
#include "arm_not_u8.c"
#include "arm_offset_f32.c"
#include "arm_offset_f64.c"
#include "arm_offset_q15.c"
#include "arm_offset_q31.c"
#include "arm_offset_q7.c"
#include "arm_or_u16.c"
#include "arm_or_u32.c"
#include "arm_or_u8.c"
#include "arm_scale_f32.c"
#include "arm_scale_f64.c"
#include "arm_scale_q15.c"
#include "arm_scale_q31.c"
#include "arm_scale_q7.c"
#include "arm_shift_q15.c"
#include "arm_shift_q31.c"
#include "arm_shift_q7.c"
#include "arm_sub_f32.c"
#include "arm_sub_f64.c"
#include "arm_sub_q15.c"
#include "arm_sub_q31.c"
#include "arm_sub_q7.c"
#include "arm_xor_u16.c"
#include "arm_xor_u32.c"
#include "arm_xor_u8.c"
#include "arm_clip_f32.c"
#include "arm_clip_q31.c"
#include "arm_clip_q15.c"
#include "arm_clip_q7.c"

View File

@@ -0,0 +1,37 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctionsF16.c
* Description: Combination of all basic math function f16 source files.
*
* $Date: 20. April 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f16.c"
#include "arm_add_f16.c"
#include "arm_dot_prod_f16.c"
#include "arm_mult_f16.c"
#include "arm_negate_f16.c"
#include "arm_offset_f16.c"
#include "arm_scale_f16.c"
#include "arm_sub_f16.c"
#include "arm_clip_f16.c"

View File

@@ -0,0 +1,41 @@
cmake_minimum_required (VERSION 3.14)
project(CMSISDSPBasicMath)
include(configLib)
include(configDsp)
file(GLOB SRCF64 "./*_f64.c")
file(GLOB SRCF32 "./*_f32.c")
file(GLOB SRCF16 "./*_f16.c")
file(GLOB SRCQ31 "./*_q31.c")
file(GLOB SRCQ15 "./*_q15.c")
file(GLOB SRCQ7 "./*_q7.c")
file(GLOB SRCU32 "./*_u32.c")
file(GLOB SRCU16 "./*_u16.c")
file(GLOB SRCU8 "./*_u8.c")
add_library(CMSISDSPBasicMath STATIC ${SRCF64})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCF32})
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
target_sources(CMSISDSPBasicMath PRIVATE ${SRCF16})
endif()
target_sources(CMSISDSPBasicMath PRIVATE ${SRCQ31})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCQ15})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCQ7})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCU32})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCU16})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCU8})
configLib(CMSISDSPBasicMath ${ROOT})
configDsp(CMSISDSPBasicMath ${ROOT})
### Includes
target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")

View File

@@ -0,0 +1,198 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f16.c
* Description: Floating-point vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vabsq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrc);
vstrhq_p(pDst, vabsq(vec1), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_abs_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f16(pSrc);
res = vabsq_f16(vec1);
vst1q_f16(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAbs group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_abs_f32.c
* Description: Floating-point vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,128 +26,171 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
#include <math.h>
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicAbs Vector Absolute Value
*
* Computes the absolute value of a vector on an element-by-element basis.
*
* <pre>
* pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Floating-point vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4; /* temporary variables */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vabsq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrc);
vstrwq_p(pDst, vabsq(vec1), p0);
}
}
#else
void arm_abs_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vabsq_f32(vec1);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
/* read sample from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
/* find absolute value */
in1 = fabsf(in1);
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
/* read sample from source */
in4 = *(pSrc + 3);
*pDst++ = fabsf(*pSrc++);
/* find absolute value */
in2 = fabsf(in2);
*pDst++ = fabsf(*pSrc++);
/* read sample from source */
*pDst = in1;
*pDst++ = fabsf(*pSrc++);
/* find absolute value */
in3 = fabsf(in3);
/* find absolute value */
in4 = fabsf(in4);
/* store result to destination */
*(pDst + 1) = in2;
/* store result to destination */
*(pDst + 2) = in3;
/* store result to destination */
*(pDst + 3) = in4;
/* Update source pointer to process next sampels */
pSrc += 4U;
/* Update destination pointer to process next sampels */
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabsf(*pSrc++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f64.c
* Description: Floating-point vector absolute value
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
@brief Floating-point vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_abs_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and store result in destination buffer. */
*pDst++ = fabs(*pSrc++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicAbs group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_abs_q15.c
* Description: Q15 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,142 +26,153 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q15 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Q15 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
#if defined (ARM_MATH_DSP)
__SIMD32_TYPE *simd;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqabsq(vecSrc), p0);
}
}
/* Run the below code for Cortex-M4 and Cortex-M3 */
#else
void arm_abs_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
q15_t in1; /* Input value1 */
q15_t in2; /* Input value2 */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
simd = __SIMD32_CONST(pDst);
while (blkCnt > 0U)
{
/* C = |A| */
/* Read two inputs */
in1 = *pSrc++;
in2 = *pSrc++;
/* Store the Absolute result in the destination buffer by packing the two values, in a single cycle */
#ifndef ARM_MATH_BIG_ENDIAN
*simd++ =
__PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
*simd++ =
__PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = *pSrc++;
in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
*simd++ =
__PKHBT(((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)),
((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)), 16);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
*simd++ =
__PKHBT(((in2 > 0) ? in2 : (q15_t)__QSUB16(0, in2)),
((in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1)), 16);
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
pDst = (q15_t *)simd;
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
in1 = *pSrc++;
/* Calculate absolute value of input and then store the result in the destination buffer. */
*pDst++ = (in1 > 0) ? in1 : (q15_t)__QSUB16(0, in1);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
q15_t in; /* Temporary input variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
/* Calculate absolute of input (if -1 then saturated to 0x7fff) and store result in destination buffer. */
in = *pSrc++;
/* Calculate absolute value of input and then store the result in the destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q15_t)__QSUB16(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q15_t) 0x8000) ? 0x7fff : -in);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_abs_q31.c
* Description: Q31 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,93 +26,183 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q31 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Q31 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q31_t in; /* Input value */
uint32_t blkCnt; /* Loop counters */
q31x4_t vecSrc;
#if defined (ARM_MATH_DSP)
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* Advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* Tail
*/
blkCnt = blockSize & 3;
/*loop Unrolling */
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqabsq(vecSrc), p0);
}
}
#else
void arm_abs_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary variable */
#if defined(ARM_MATH_NEON)
int32x4_t vec1;
int32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute and then store the results in the destination buffer. */
vec1 = vld1q_s32(pSrc);
res = vqabsq_s32(vec1);
vst1q_s32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = (in1 > 0) ? in1 : (q31_t)__QSUB(0, in1);
*pDst++ = (in2 > 0) ? in2 : (q31_t)__QSUB(0, in2);
*pDst++ = (in3 > 0) ? in3 : (q31_t)__QSUB(0, in3);
*pDst++ = (in4 > 0) ? in4 : (q31_t)__QSUB(0, in4);
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement the loop counter */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute value of the input (if -1 then saturated to 0x7fffffff) and then store the results in the destination buffer. */
in = *pSrc++;
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
/* Decrement the loop counter */
/* Calculate absolute of input (if -1 then saturated to 0x7fffffff) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q31_t)__QSUB(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == INT32_MIN) ? INT32_MAX : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* #if defined (ARM_MATH_MVEI) */
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_abs_q7.c
* Description: Q7 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,120 +26,155 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
@addtogroup BasicAbs
@{
*/
/**
* @brief Q7 vector absolute value.
* @param[in] *pSrc points to the input buffer
* @param[out] *pDst points to the output buffer
* @param[in] blockSize number of samples in each vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
@brief Q7 vector absolute value.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_abs_q7(
q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q7_t in; /* Input value1 */
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4; /* temporary input variables */
q31_t out1, out2, out3, out4; /* temporary output variables */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read inputs */
in1 = (q31_t) * pSrc;
in2 = (q31_t) * (pSrc + 1);
in3 = (q31_t) * (pSrc + 2);
/* find absolute value */
out1 = (in1 > 0) ? in1 : (q31_t)__QSUB8(0, in1);
/* read input */
in4 = (q31_t) * (pSrc + 3);
/* find absolute value */
out2 = (in2 > 0) ? in2 : (q31_t)__QSUB8(0, in2);
/* store result to destination */
*pDst = (q7_t) out1;
/* find absolute value */
out3 = (in3 > 0) ? in3 : (q31_t)__QSUB8(0, in3);
/* find absolute value */
out4 = (in4 > 0) ? in4 : (q31_t)__QSUB8(0, in4);
/* store result to destination */
*(pDst + 1) = (q7_t) out2;
/* store result to destination */
*(pDst + 2) = (q7_t) out3;
/* store result to destination */
*(pDst + 3) = (q7_t) out4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
blkCnt = blockSize;
#endif /* #define ARM_MATH_CM0_FAMILY */
while (blkCnt > 0U)
{
/* C = |A| */
/* Read the input */
in = *pSrc++;
/* Store the Absolute result in the destination buffer */
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? 0x7f : -in);
/* Decrement the loop counter */
blkCnt--;
}
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = |A|
* Calculate absolute and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqabsq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqabsq(vecSrc), p0);
}
}
#else
void arm_abs_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t)__QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute of input (if -1 then saturated to 0x7f) and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (in > 0) ? in : (q7_t) __QSUB8(0, in);
#else
*pDst++ = (in > 0) ? in : ((in == (q7_t) 0x80) ? (q7_t) 0x7f : -in);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicAbs group
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,169 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f16.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vaddq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vaddq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_add_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAdd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_add_f32.c
* Description: Floating-point vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,113 +26,174 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicAdd Vector Addition
*
* Element-by-element addition of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Floating-point vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* temporary input variabels */
float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vaddq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrwq_p(pDst, vaddq(vec1,vec2), p0);
}
}
#else
void arm_add_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* read four inputs from sourceA and four inputs from sourceB */
inA1 = *pSrcA;
inB1 = *pSrcB;
inA2 = *(pSrcA + 1);
inB2 = *(pSrcB + 1);
inA3 = *(pSrcA + 2);
inB3 = *(pSrcB + 2);
inA4 = *(pSrcA + 3);
inB4 = *(pSrcB + 3);
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* C = A + B */
/* add and store result to destination */
*pDst = inA1 + inB1;
*(pDst + 1) = inA2 + inB2;
*(pDst + 2) = inA3 + inB3;
*(pDst + 3) = inA4 + inB4;
/* update pointers to process next samples */
pSrcA += 4U;
pSrcB += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f64.c
* Description: Floating-point vector addition
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Floating-point vector addition.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_add_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = (*pSrcA++) + (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicAdd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_add_q15.c
* Description: Q15 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,103 +26,151 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Q15 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q15x8_t vecA;
q15x8_t vecB;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inB1, inB2;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
inA1 = *__SIMD32(pSrcA)++;
inA2 = *__SIMD32(pSrcA)++;
inB1 = *__SIMD32(pSrcB)++;
inB2 = *__SIMD32(pSrcB)++;
*__SIMD32(pDst)++ = __QADD16(inA1, inB1);
*__SIMD32(pDst)++ = __QADD16(inA2, inB2);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrhq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t inB1, inB2;
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from sourceA */
inA1 = read_q15x2_ia (&pSrcA);
inA2 = read_q15x2_ia (&pSrcA);
/* read 2 times 2 samples at a time from sourceB */
inB1 = read_q15x2_ia (&pSrcB);
inB2 = read_q15x2_ia (&pSrcB);
/* Add and store 2 times 2 samples at a time */
write_q15x2_ia (&pDst, __QADD16(inA1, inB1));
write_q15x2_ia (&pDst, __QADD16(inA2, inB2));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ + *pSrcB++), 16);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrcA++ + *pSrcB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_add_q31.c
* Description: Q31 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,111 +26,134 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
@addtogroup BasicAdd
@{
*/
/**
* @brief Q31 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt;
q31x4_t vecA;
q31x4_t vecB;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
*pDst++ = __QADD(inA1, inB1);
*pDst++ = __QADD(inA2, inB2);
*pDst++ = __QADD(inA3, inB3);
*pDst++ = __QADD(inA4, inB4);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrwq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ + *pSrcB++);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
*pDst++ = __QADD(*pSrcA++, *pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View File

@@ -1,15 +1,5 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q7.c
* Description: Q7 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,97 +16,144 @@
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
*/
/**
* @brief Q7 vector addition.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q7.c
* Description: Q7 vector addition
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
@brief Q7 vector addition.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_add_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q7x16_t vecA;
q7x16_t vecB;
#if defined (ARM_MATH_DSP)
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A + B
* Add and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqaddq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrbq_p(pDst, vqaddq(vecA, vecB), p0);
}
}
#else
void arm_add_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M4 and Cortex-M3 */
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8 (read_q7x4_ia (&pSrcA), read_q7x4_ia (&pSrcB)));
#else
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
*pDst++ = (q7_t) __SSAT ((q15_t) *pSrcA++ + *pSrcB++, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ + *pSrcB++, 8);
/* Decrement the loop counter */
/* Add and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrcA++ + *pSrcB++, 8);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicAdd group
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u16.c
* Description: uint16_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup And Vector bitwise AND
Compute the logical bitwise AND.
There are separate functions for uint32_t, uint16_t, and uint7_t data types.
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u16(
const uint16_t * pSrcA,
const uint16_t * pSrcB,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrcA, vecSrcB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecA = vld1q_u16(pSrcA);
vecB = vld1q_u16(pSrcB);
vst1q_u16(pDst, vandq_u16(vecA, vecB) );
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u32.c
* Description: uint32_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u32(
const uint32_t * pSrcA,
const uint32_t * pSrcB,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrcA, vecSrcB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecA, vecB;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
vecA = vld1q_u32(pSrcA);
vecB = vld1q_u32(pSrcB);
vst1q_u32(pDst, vandq_u32(vecA, vecB) );
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u8.c
* Description: uint8_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
@brief Compute the logical bitwise AND of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_and_u8(
const uint8_t * pSrcA,
const uint8_t * pSrcB,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrcA, vecSrcB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecA = vld1q_u8(pSrcA);
vecB = vld1q_u8(pSrcB);
vst1q_u8(pDst, vandq_u8(vecA, vecB) );
pSrcA += 16;
pSrcB += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = (*pSrcA++)&(*pSrcB++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,141 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f16.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_f16(const float16_t * pSrc,
float16_t * pDst,
float16_t low,
float16_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f16x8_t curVec0, curVec1;
f16x8_t vecLow, vecHigh;
vecLow = vdupq_n_f16(low);
vecHigh = vdupq_n_f16(high);
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxnmq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_clip_f16(const float16_t * pSrc,
float16_t * pDst,
float16_t low,
float16_t high,
uint32_t numSamples)
{
for (uint32_t i = 0; i < numSamples; i++)
{
if ((_Float16)pSrc[i] > (_Float16)high)
pDst[i] = high;
else if ((_Float16)pSrc[i] < (_Float16)low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_FLOAT16_SUPPORTED */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,144 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f32.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicClip Elementwise clipping
Element-by-element clipping of a value.
The value is constrained between 2 bounds.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_f32(const float32_t * pSrc,
float32_t * pDst,
float32_t low,
float32_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f32x4_t curVec0, curVec1;
f32x4_t vecLow, vecHigh;
vecLow = vdupq_n_f32(low);
vecHigh = vdupq_n_f32(high);
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxnmq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_f32(const float32_t * pSrc,
float32_t * pDst,
float32_t low,
float32_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q15.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q15(const q15_t * pSrc,
q15_t * pDst,
q15_t low,
q15_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q15x8_t curVec0, curVec1;
q15x8_t vecLow, vecHigh;
vecLow = vdupq_n_s16(low);
vecHigh = vdupq_n_s16(high);
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q15(const q15_t * pSrc,
q15_t * pDst,
q15_t low,
q15_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q31.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q31(const q31_t * pSrc,
q31_t * pDst,
q31_t low,
q31_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q31x4_t curVec0, curVec1;
q31x4_t vecLow, vecHigh;
vecLow = vdupq_n_s32(low);
vecHigh = vdupq_n_s32(high);
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q31(const q31_t * pSrc,
q31_t * pDst,
q31_t low,
q31_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q7.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_clip_q7(const q7_t * pSrc,
q7_t * pDst,
q7_t low,
q7_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q7x16_t curVec0, curVec1;
q7x16_t vecLow, vecHigh;
vecLow = vdupq_n_s8(low);
vecHigh = vdupq_n_s8(high);
curVec0 = vld1q(pSrc);
pSrc += 16;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
blkCnt = numSamples >> 5;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 16;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec1 = vmaxq(curVec1, vecLow);
curVec0 = vld1q(pSrc);
pSrc += 16;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 16;
}
/*
* Tail handling
*/
blkCnt = numSamples - ((numSamples >> 5) << 5);
if (blkCnt >= 16)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec0 = vld1q(pSrc);
pSrc += 16;
}
if (blkCnt > 0)
{
mve_pred16_t p0 = vctp8q(blkCnt & 0xf);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrbq_p(pDst, curVec0, p0);
}
}
#else
void arm_clip_q7(const q7_t * pSrc,
q7_t * pDst,
q7_t low,
q7_t high,
uint32_t numSamples)
{
uint32_t i;
for (i = 0; i < numSamples; i++)
{
if (pSrc[i] > high)
pDst[i] = high;
else if (pSrc[i] < low)
pDst[i] = low;
else
pDst[i] = pSrc[i];
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,184 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f16.c
* Description: Floating-point dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t blockSize,
float16_t * result)
{
f16x8_t vecA, vecB;
f16x8_t vecSum;
uint32_t blkCnt;
float16_t sum = 0.0f;
vecSum = vdupq_n_f16(0.0f);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
* and advance vector source and destination pointers
*/
vecA = vld1q(pSrcA);
pSrcA += 8;
vecB = vld1q(pSrcB);
pSrcB += 8;
vecSum = vfmaq(vecSum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt --;
}
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
}
sum = vecAddAcrossF16Mve(vecSum);
/* Store result in destination buffer */
*result = sum;
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t blockSize,
float16_t * result)
{
uint32_t blkCnt; /* Loop counter */
_Float16 sum = 0.0f; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicDotProd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_dot_prod_f32.c
* Description: Floating-point dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 05 October 2021
* $Revision: V1.9.1
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,98 +26,203 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup dot_prod Vector Dot Product
*
* Computes the dot product of two vectors.
* The vectors are multiplied element-by-element and then summed.
*
* <pre>
* sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of floating-point vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_f32(
float32_t * pSrcA,
float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
float32_t sum = 0.0f; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
f32x4_t vecA, vecB;
f32x4_t vecSum;
uint32_t blkCnt;
float32_t sum = 0.0f;
vecSum = vdupq_n_f32(0.0f);
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
* and advance vector source and destination pointers
*/
vecA = vld1q(pSrcA);
pSrcA += 4;
vecB = vld1q(pSrcB);
pSrcB += 4;
vecSum = vfmaq(vecSum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt --;
}
#if defined (ARM_MATH_DSP)
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
}
sum = vecAddAcrossF32Mve(vecSum);
/* Store result in destination buffer */
*result = sum;
}
#else
void arm_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t blockSize,
float32_t * result)
{
uint32_t blkCnt; /* Loop counter */
float32_t sum = 0.0f; /* Temporary return variable */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t accum = vdupq_n_f32(0);
#if !defined(__aarch64__)
f32x2_t tmp = vdup_n_f32(0);
#endif
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
while (blkCnt > 0U)
{
/* C = A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + ... + A[blockSize-1]*B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
accum = vmlaq_f32(accum, vec1, vec2);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
/* Decrement the loop counter */
blkCnt--;
}
#if defined(__aarch64__)
sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
#else
tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer */
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement the loop counter */
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* Store the result back in the destination buffer */
/* Store result in destination buffer */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,78 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f64.c
* Description: Floating-point dot product
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
void arm_dot_prod_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
uint32_t blockSize,
float64_t * result)
{
uint32_t blkCnt; /* Loop counter */
float64_t sum = 0.; /* Temporary return variable */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and store result in a temporary buffer. */
sum += (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Store result in destination buffer */
*result = sum;
}
/**
@} end of BasicDotProd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q15.c
* Description: Q15 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,103 +26,147 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q15 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
* results are added to a 64-bit accumulator in 34.30 format.
* Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
* there is no risk of overflow.
* The return result is in 34.30 format.
@brief Dot product of Q15 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
results are added to a 64-bit accumulator in 34.30 format.
Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
there is no risk of overflow.
The return result is in 34.30 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q15(
q15_t * pSrcA,
q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q15x8_t vecA;
q15x8_t vecB;
q63_t sum = 0LL;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the results in a temporary buffer. */
sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
/* Decrement the loop counter */
blkCnt--;
}
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmlaldavaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmlaldavaq_p(sum, vecA, vecB, p0);
}
*result = sum;
}
#else
void arm_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
#if defined (ARM_MATH_DSP)
/* Calculate dot product and store result in a temporary buffer. */
sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the results in a temporary buffer. */
sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLALD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the result in the destination buffer in 34.30 format */
/* Store result in destination buffer in 34.30 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q31.c
* Description: Q31 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,106 +26,149 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q31 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
* are truncated to 2.48 format by discarding the lower 14 bits.
* The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
* There are 15 guard bits in the accumulator and there is no risk of overflow as long as
* the length of the vectors is less than 2^16 elements.
* The return result is in 16.48 format.
@brief Dot product of Q31 vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
are truncated to 2.48 format by discarding the lower 14 bits.
The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
There are 15 guard bits in the accumulator and there is no risk of overflow as long as
the length of the vectors is less than 2^16 elements.
The return result is in 16.48 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q31(
q31_t * pSrcA,
q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
q63_t sum = 0; /* Temporary result storage */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q31x4_t vecA;
q31x4_t vecB;
q63_t sum = 0LL;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vrmlaldavhaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vrmlaldavhaq_p(sum, vecA, vecB, p0);
}
#if defined (ARM_MATH_DSP)
/*
* vrmlaldavhaq provides extra intermediate accumulator headroom.
* limiting the need of intermediate scaling
* Scalar variant uses 2.48 accu format by right shifting accumulators by 14.
* 16.48 output conversion is performed outside the loop by scaling accu. by 6
*/
*result = asrl(sum, (14 - 8));
}
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4;
q31_t inB1, inB2, inB3, inB4;
#else
void arm_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t blockSize,
q63_t * result)
{
uint32_t blkCnt; /* Loop counter */
q63_t sum = 0; /* Temporary return variable */
/*loop Unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
sum += ((q63_t) inA1 * inB1) >> 14U;
sum += ((q63_t) inA2 * inB2) >> 14U;
sum += ((q63_t) inA3 * inB3) >> 14U;
sum += ((q63_t) inA4 * inB4) >> 14U;
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement the loop counter */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Calculate dot product and then store the result in a temporary buffer. */
sum += ((q63_t) * pSrcA++ * *pSrcB++) >> 14U;
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
sum += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
/* Decrement loop counter */
blkCnt--;
}
/* Store the result in the destination buffer in 16.48 format */
/* Store result in destination buffer in 16.48 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_dot_prod_q7.c
* Description: Q7 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,64 +26,112 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
@addtogroup BasicDotProd
@{
*/
/**
* @brief Dot product of Q7 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
* results are added to an accumulator in 18.14 format.
* Nonsaturating additions are used and there is no danger of wrap around as long as
* the vectors are less than 2^18 elements long.
* The return result is in 18.14 format.
@brief Dot product of Q7 vectors.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] blockSize number of samples in each vector
@param[out] result output result returned here
@return none
@par Scaling and Overflow Behavior
The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
results are added to an accumulator in 18.14 format.
Nonsaturating additions are used and there is no danger of wrap around as long as
the vectors are less than 2^18 elements long.
The return result is in 18.14 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_dot_prod_q7(
q7_t * pSrcA,
q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q7x16_t vecA;
q7x16_t vecB;
q31_t sum = 0;
q31_t sum = 0; /* Temporary variables to store output */
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Calculate dot product and then store the result in a temporary buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmladavaq(sum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
sum = vmladavaq_p(sum, vecA, vecB, p0);
}
*result = sum;
}
#else
void arm_dot_prod_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
uint32_t blockSize,
q31_t * result)
{
uint32_t blkCnt; /* Loop counter */
q31_t sum = 0; /* Temporary return variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t input1, input2; /* Temporary variables */
q31_t inA1, inA2, inB1, inB2; /* Temporary variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t input1, input2; /* Temporary variables to store input */
q31_t inA1, inA2, inB1, inB2; /* Temporary variables to store input */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
#if defined (ARM_MATH_DSP)
/* read 4 samples at a time from sourceA */
input1 = *__SIMD32(pSrcA)++;
input1 = read_q7x4_ia (&pSrcA);
/* read 4 samples at a time from sourceB */
input2 = *__SIMD32(pSrcB)++;
input2 = read_q7x4_ia (&pSrcB);
/* extract two q7_t samples to q15_t samples */
inA1 = __SXTB16(__ROR(input1, 8));
@@ -97,51 +145,47 @@ void arm_dot_prod_q7(
/* multiply and accumulate two samples at a time */
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Dot product and then store the results in a temporary buffer. */
sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Dot product and then store the results in a temporary buffer. */
sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);
/* Decrement the loop counter */
/* Calculate dot product and store result in a temporary buffer. */
//#if defined (ARM_MATH_DSP)
// sum = __SMLAD(*pSrcA++, *pSrcB++, sum);
//#else
sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);
//#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the result in the destination buffer in 18.14 format */
/* Store result in destination buffer in 18.14 format */
*result = sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of dot_prod group
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f16.c
* Description: Floating-point vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vmulq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vmulq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_mult_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mult_f32.c
* Description: Floating-point vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,137 +26,175 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup BasicMult Vector Multiplication
*
* Element-by-element multiplication of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Floating-point vector multiplication.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
#if defined (ARM_MATH_DSP)
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* temporary input variables */
float32_t inB1, inB2, inB3, inB4; /* temporary input variables */
float32_t out1, out2, out3, out4; /* temporary output variables */
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* loop Unrolling */
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + B */
/* Add and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vmulq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = A + B */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrwq_p(pDst, vmulq(vec1,vec2), p0);
}
}
#else
void arm_mult_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t vec2;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrcA);
vec2 = vld1q_f32(pSrcB);
res = vmulq_f32(vec1, vec2);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrcA += 4;
pSrcB += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in output buffer */
/* read sample from sourceA */
inA1 = *pSrcA;
/* read sample from sourceB */
inB1 = *pSrcB;
/* read sample from sourceA */
inA2 = *(pSrcA + 1);
/* read sample from sourceB */
inB2 = *(pSrcB + 1);
/* out = sourceA * sourceB */
out1 = inA1 * inB1;
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* read sample from sourceA */
inA3 = *(pSrcA + 2);
/* read sample from sourceB */
inB3 = *(pSrcB + 2);
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* out = sourceA * sourceB */
out2 = inA2 * inB2;
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* read sample from sourceA */
inA4 = *(pSrcA + 3);
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* store result to destination buffer */
*pDst = out1;
/* read sample from sourceB */
inB4 = *(pSrcB + 3);
/* out = sourceA * sourceB */
out3 = inA3 * inB3;
/* store result to destination buffer */
*(pDst + 1) = out2;
/* out = sourceA * sourceB */
out4 = inA4 * inB4;
/* store result to destination buffer */
*(pDst + 2) = out3;
/* store result to destination buffer */
*(pDst + 3) = out4;
/* update pointers to process next samples */
pSrcA += 4U;
pSrcB += 4U;
pDst += 4U;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in output buffer */
/* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f64.c
* Description: Floating-point vector multiplication
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_mult_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply input and store result in destination buffer. */
*pDst++ = (*pSrcA++) * (*pSrcB++);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mult_q15.c
* Description: Q15 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,68 +26,116 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q15 vector multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 vector multiplication
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* loop counters */
q15x8_t vecA, vecB;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrhq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2, inB1, inB2; /* Temporary input variables */
q15_t out1, out2, out3, out4; /* Temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
q15_t out1, out2, out3, out4; /* temporary output variables */
q31_t mul1, mul2, mul3, mul4; /* temporary variables */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read two samples at a time from sourceA */
inA1 = *__SIMD32(pSrcA)++;
/* read two samples at a time from sourceB */
inB1 = *__SIMD32(pSrcB)++;
/* read two samples at a time from sourceA */
inA2 = *__SIMD32(pSrcA)++;
/* read two samples at a time from sourceB */
inB2 = *__SIMD32(pSrcB)++;
/* C = A * B */
#if defined (ARM_MATH_DSP)
/* read 2 samples at a time from sourceA */
inA1 = read_q15x2_ia (&pSrcA);
/* read 2 samples at a time from sourceB */
inB1 = read_q15x2_ia (&pSrcB);
/* read 2 samples at a time from sourceA */
inA2 = read_q15x2_ia (&pSrcA);
/* read 2 samples at a time from sourceB */
inB2 = read_q15x2_ia (&pSrcB);
/* multiply mul = sourceA * sourceB */
mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
mul2 = (q31_t) ((q15_t) (inA1 ) * (q15_t) (inB1 ));
mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
mul4 = (q31_t) ((q15_t) (inA2 ) * (q15_t) (inB2 ));
/* saturate result to 16 bit */
out1 = (q15_t) __SSAT(mul1 >> 15, 16);
@@ -95,48 +143,50 @@ void arm_mult_q15(
out3 = (q15_t) __SSAT(mul3 >> 15, 16);
out4 = (q15_t) __SSAT(mul4 >> 15, 16);
/* store the result */
/* store result to destination */
#ifndef ARM_MATH_BIG_ENDIAN
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
write_q15x2_ia (&pDst, __PKHBT(out2, out1, 16));
write_q15x2_ia (&pDst, __PKHBT(out4, out3, 16));
#else
*__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
*__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
write_q15x2_ia (&pDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pDst, __PKHBT(out3, out4, 16));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Decrement the blockSize loop counter */
#else
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
/* Multiply inputs and store result in destination buffer. */
*pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mult_q31.c
* Description: Q31 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,123 +26,143 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q31 vector multiplication.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* loop counters */
q31x4_t vecA, vecB;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inA1, inA2, inA3, inA4; /* temporary input variables */
q31_t inB1, inB2, inB3, inB4; /* temporary input variables */
q31_t out1, out2, out3, out4; /* temporary output variables */
/* loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inA2 = *pSrcA++;
inA3 = *pSrcA++;
inA4 = *pSrcA++;
inB1 = *pSrcB++;
inB2 = *pSrcB++;
inB3 = *pSrcB++;
inB4 = *pSrcB++;
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB2) >> 32;
out3 = ((q63_t) inA3 * inB3) >> 32;
out4 = ((q63_t) inA4 * inB4) >> 32;
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
*pDst++ = out1 << 1U;
*pDst++ = out2 << 1U;
*pDst++ = out3 << 1U;
*pDst++ = out4 << 1U;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
inA1 = *pSrcA++;
inB1 = *pSrcB++;
out1 = ((q63_t) inA1 * inB1) >> 32;
out1 = __SSAT(out1, 31);
*pDst++ = out1 << 1U;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrwq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t out; /* Temporary output variable */
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and then store the results in the destination buffer. */
*pDst++ =
(q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
/* Decrement the blockSize loop counter */
/* Multiply inputs and store result in destination buffer. */
out = ((q63_t) *pSrcA++ * *pSrcB++) >> 32;
out = __SSAT(out, 31);
*pDst++ = out << 1U;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mult_q7.c
* Description: Q7 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,90 +26,143 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
@addtogroup BasicMult
@{
*/
/**
* @brief Q7 vector multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
@brief Q7 vector multiplication
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_mult_q7(
q7_t * pSrcA,
q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counters */
uint32_t blkCnt; /* loop counters */
q7x16_t vecA, vecB;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A * B
* Multiply the inputs and then store the results in the destination buffer.
*/
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vst1q(pDst, vqdmulhq(vecA, vecB));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrcA += 16;
pSrcB += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vstrbq_p(pDst, vqdmulhq(vecA, vecB), p0);
}
}
#else
void arm_mult_q7(
const q7_t * pSrcA,
const q7_t * pSrcB,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q7_t out1, out2, out3, out4; /* Temporary output variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
/* loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the results in temporary variables */
#if defined (ARM_MATH_DSP)
/* Multiply inputs and store results in temporary variables */
out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Store the results of 4 inputs in the destination buffer in single cycle by packing */
*__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
/* Pack and store result in destination buffer (in single write) */
write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
#else
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
#endif
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply the inputs and store the result in the destination buffer */
/* Multiply input and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of BasicMult group
@} end of BasicMult group
*/

View File

@@ -0,0 +1,166 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f16.c
* Description: Negates floating-point vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vnegq(vec1);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q((float16_t const *) pSrc);
vstrhq_p(pDst, vnegq(vec1), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_negate_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicNegate group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_negate_f32.c
* Description: Negates floating-point vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,109 +26,167 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup negate Vector Negate
*
* Negates the elements of a vector.
*
* <pre>
* pDst[n] = -pSrc[n], 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
f32x4_t vec1;
f32x4_t res;
#if defined (ARM_MATH_DSP)
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = |A| */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4; /* temporary variables */
/* Calculate absolute values and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vnegq(vec1);
vst1q(pDst, res);
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read inputs from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
in3 = *(pSrc + 2);
in4 = *(pSrc + 3);
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
/* C = |A| */
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q((float32_t const *) pSrc);
vstrwq_p(pDst, vnegq(vec1), p0);
}
/* negate the input */
in1 = -in1;
in2 = -in2;
in3 = -in3;
in4 = -in4;
/* store the result to destination */
*pDst = in1;
*(pDst + 1) = in2;
*(pDst + 2) = in3;
*(pDst + 3) = in4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
}
#else
void arm_negate_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M0 */
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vnegq_f32(vec1);
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement the loop counter */
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of negate group
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f64.c
* Description: Negates floating-point vectors
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
@brief Negates the elements of a floating-point vector.
@param[in] pSrc points to input vector.
@param[out] pDst points to output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
void arm_negate_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and store result in destination buffer. */
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicNegate group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_negate_q15.c
* Description: Negates Q15 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,106 +26,146 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Negates the elements of a Q15 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Conditions for optimum performance
Input and output buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q15_t in;
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqnegq(vecSrc), p0);
}
}
#else
void arm_negate_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variables */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2; /* Temporary variables */
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Read two inputs at a time */
in1 = _SIMD32_OFFSET(pSrc);
in2 = _SIMD32_OFFSET(pSrc + 2);
/* negate two samples at a time */
in1 = __QSUB16(0, in1);
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (2 samples at a time). */
in1 = read_q15x2_ia (&pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
/* negate two samples at a time */
in2 = __QSUB16(0, in2);
in1 = read_q15x2_ia (&pSrc);
write_q15x2_ia (&pDst, __QSUB16(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* store the result to destination 2 samples at a time */
_SIMD32_OFFSET(pDst) = in1;
/* store the result to destination 2 samples at a time */
_SIMD32_OFFSET(pDst + 2) = in2;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the result in the destination buffer. */
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of negate group
@} end of BasicNegate group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_negate_q31.c
* Description: Negates Q31 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,92 +26,153 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Negates the elements of a Q31 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
q31_t in; /* Temporary variable */
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
#if defined (ARM_MATH_DSP)
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqnegq(vecSrc), p0);
}
}
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
#else
void arm_negate_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
/*loop Unrolling */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = __QSUB(0, in1);
*pDst++ = __QSUB(0, in2);
*pDst++ = __QSUB(0, in3);
*pDst++ = __QSUB(0, in4);
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement the loop counter */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the result in the destination buffer. */
in = *pSrc++;
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of negate group
@} end of BasicNegate group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_negate_q7.c
* Description: Negates Q7 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,88 +26,146 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup negate
* @{
@addtogroup BasicNegate
@{
*/
/**
* @brief Negates the elements of a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
@brief Negates the elements of a Q7 vector.
@param[in] pSrc points to the input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q7 value -1 (0x80) is saturated to the maximum allowable positive value 0x7F.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_negate_q7(
q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
q7_t in;
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = -A
* Negate and then store the results in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqnegq(vecSrc));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqnegq(vecSrc), p0);
}
}
#else
void arm_negate_q7(
const q7_t * pSrc,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
q7_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t in1; /* Temporary input variable */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t input; /* Input values1-4 */
q31_t zero = 0x00000000;
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = -A */
/* Read four inputs */
input = *__SIMD32(pSrc)++;
/* Store the Negated results in the destination buffer in a single cycle by packing the results */
*__SIMD32(pDst)++ = __QSUB8(zero, input);
#if defined (ARM_MATH_DSP)
/* Negate and store result in destination buffer (4 samples at a time). */
in1 = read_q7x4_ia (&pSrc);
write_q7x4_ia (&pDst, __QSUB8(0, in1));
#else
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
/* Decrement the loop counter */
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = -A */
/* Negate and then store the results in the destination buffer. */ \
in = *pSrc++;
*pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;
/* Decrement the loop counter */
/* Negate and store result in destination buffer. */
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = (q7_t) __QSUB8(0, in);
#else
*pDst++ = (in == (q7_t) 0x80) ? (q7_t) 0x7f : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of negate group
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u16.c
* Description: uint16_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Not Vector bitwise NOT
Compute the logical bitwise NOT.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u16(
const uint16_t * pSrc,
uint16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u16(vecSrc) );
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint16x8_t inV;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
inV = vld1q_u16(pSrc);
vst1q_u16(pDst, vmvnq_u16(inV) );
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 7;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u32.c
* Description: uint32_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u32(
const uint32_t * pSrc,
uint32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u32(vecSrc) );
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint32x4_t inV;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
inV = vld1q_u32(pSrc);
vst1q_u32(pDst, vmvnq_u32(inV) );
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u8.c
* Description: uint8_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
@brief Compute the logical bitwise NOT of a fixed-point vector.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_not_u8(
const uint8_t * pSrc,
uint8_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst, vmvnq_u8(vecSrc) );
pSrc += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
uint8x16_t inV;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
inV = vld1q_u8(pSrc);
vst1q_u8(pDst, vmvnq_u8(inV) );
pSrc += 16;
pDst += 16;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0xF;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif
while (blkCnt > 0U)
{
*pDst++ = ~(*pSrc++);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f16.c
* Description: Floating-point vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_f16(
const float16_t * pSrc,
float16_t offset,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vaddq(vec1,offset);
vst1q(pDst, res);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q((float16_t const *) pSrc);
vstrhq_p(pDst, vaddq(vec1, offset), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_offset_f16(
const float16_t * pSrc,
float16_t offset,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
/* Decrement loop counter */
blkCnt--;
}
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicOffset group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_offset_f32.c
* Description: Floating-point vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,129 +26,171 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @defgroup offset Vector Offset
*
* Adds a constant offset to each element of a vector.
*
* <pre>
* pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a floating-point vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_f32(
float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
f32x4_t vec1;
f32x4_t res;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q(pSrc);
res = vaddq(vec1,offset);
vst1q(pDst, res);
/*loop Unrolling */
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vld1q((float32_t const *) pSrc);
vstrwq_p(pDst, vaddq(vec1, offset), p0);
}
}
#else
void arm_offset_f32(
const float32_t * pSrc,
float32_t offset,
float32_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
f32x4_t vec1;
f32x4_t res;
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
vec1 = vld1q_f32(pSrc);
res = vaddq_f32(vec1,vdupq_n_f32(offset));
vst1q_f32(pDst, res);
/* Increment pointers */
pSrc += 4;
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
/* read samples from source */
in1 = *pSrc;
in2 = *(pSrc + 1);
/* add offset to input */
in1 = in1 + offset;
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* read samples from source */
in3 = *(pSrc + 2);
*pDst++ = (*pSrc++) + offset;
/* add offset to input */
in2 = in2 + offset;
*pDst++ = (*pSrc++) + offset;
/* read samples from source */
in4 = *(pSrc + 3);
*pDst++ = (*pSrc++) + offset;
/* add offset to input */
in3 = in3 + offset;
/* store result to destination */
*pDst = in1;
/* add offset to input */
in4 = in4 + offset;
/* store result to destination */
*(pDst + 1) = in2;
/* store result to destination */
*(pDst + 2) = in3;
/* store result to destination */
*(pDst + 3) = in4;
/* update pointers to process next samples */
pSrc += 4U;
pDst += 4U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of offset group
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f64.c
* Description: Floating-point vector offset
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_offset_f64(
const float64_t * pSrc,
float64_t offset,
float64_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = (*pSrc++) + offset;
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of BasicOffset group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_offset_q15.c
* Description: Q15 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,99 +26,143 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q15 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
@brief Adds a constant offset to a Q15 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q15(
q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q15x8_t vecSrc;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecSrc = vld1q(pSrc);
vstrhq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q15(
const q15_t * pSrc,
q15_t offset,
q15_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t offset_packed; /* Offset packed to 32 bit */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PKHBT(offset, offset, 16);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer, 2 samples at a time. */
*__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
*__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offset_packed);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (2 samples at a time). */
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia (&pSrc), offset_packed));
write_q15x2_ia (&pDst, __QADD16(read_q15x2_ia (&pSrc), offset_packed));
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
*pDst++ = (q15_t) __QADD16(*pSrc++, offset);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
*pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = (q15_t) __QADD16(*pSrc++, offset);
#else
*pDst++ = (q15_t) __SSAT(((q31_t) *pSrc++ + offset), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of offset group
@} end of BasicOffset group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_offset_q31.c
* Description: Q31 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,103 +26,134 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q31 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
@brief Adds a constant offset to a Q31 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q31(
q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q31x4_t vecSrc;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t in1, in2, in3, in4;
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination buffer. */
in1 = *pSrc++;
in2 = *pSrc++;
in3 = *pSrc++;
in4 = *pSrc++;
*pDst++ = __QADD(in1, offset);
*pDst++ = __QADD(in2, offset);
*pDst++ = __QADD(in3, offset);
*pDst++ = __QADD(in4, offset);
/* Decrement the loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = __QADD(*pSrc++, offset);
/* Decrement the loop counter */
blkCnt--;
}
/* Compute 4 outputs at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
}
/*
* tail
*/
blkCnt = blockSize & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecSrc = vld1q(pSrc);
vstrwq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q31(
const q31_t * pSrc,
q31_t offset,
q31_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and store result in destination buffer. */
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
*pDst++ = __QADD(*pSrc++, offset);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pDst++ = __QADD(*pSrc++, offset);
#else
*pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of offset group
@} end of BasicOffset group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_offset_q7.c
* Description: Q7 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,98 +26,137 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/basic_math_functions.h"
/**
* @ingroup groupMath
@ingroup groupMath
*/
/**
* @addtogroup offset
* @{
@addtogroup BasicOffset
@{
*/
/**
* @brief Adds a constant offset to a Q7 vector.
* @param[in] *pSrc points to the input vector
* @param[in] offset is the offset to be added
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
@brief Adds a constant offset to a Q7 vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_q7(
q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* loop counter */
uint32_t blkCnt; /* loop counters */
q7x16_t vecSrc;
/* Compute 16 outputs at a time */
blkCnt = blockSize >> 4;
while (blkCnt > 0U)
{
/*
* C = A + offset
* Add offset and then store the result in the destination buffer.
*/
vecSrc = vld1q(pSrc);
vst1q(pDst, vqaddq(vecSrc, offset));
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
/*
* advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
}
/*
* tail
*/
blkCnt = blockSize & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecSrc = vld1q(pSrc);
vstrbq_p(pDst, vqaddq(vecSrc, offset), p0);
}
}
#else
void arm_offset_q7(
const q7_t * pSrc,
q7_t offset,
q7_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t offset_packed; /* Offset packed to 32 bit */
/*loop Unrolling */
blkCnt = blockSize >> 2U;
/* Offset is packed to 32 bit in order to use SIMD32 for addition */
offset_packed = __PACKq7(offset, offset, offset, offset);
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = blockSize >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the results in the destination bufferfor 4 samples at a time. */
*__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, offset_packed);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
/* Add offset and store result in destination buffer (4 samples at a time). */
write_q7x4_ia (&pDst, __QADD8(read_q7x4_ia (&pSrc), offset_packed));
#else
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = blockSize % 0x4U;
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C = A + offset */
/* Add offset and then store the result in the destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) * pSrc++ + offset, 8);
/* Decrement the loop counter */
/* Add offset and store result in destination buffer. */
*pDst++ = (q7_t) __SSAT((q15_t) *pSrc++ + offset, 8);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of offset group
@} end of BasicOffset group
*/

Some files were not shown because too many files have changed in this diff Show More