Проект реструктурирован. Исходники вынесены за пределы этого git-репозитория. Здесь осталась только оболочка.

Также был протестирован компилятор MinGW; обнаружены следующие ошибки:
- переопределение __INT64_C, __UINT64_C
- и ещё какие-то проблемы с intmax_t и uintmax_t
This commit is contained in:
2025-03-25 13:26:09 +03:00
parent a043335d9b
commit 4556b453db
1453 changed files with 359122 additions and 280 deletions

View File

@@ -0,0 +1,87 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctions.c
* Description: Combination of all basic math function source files.
*
* $Date: 16. March 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f32.c"
#include "arm_abs_f64.c"
#include "arm_abs_q15.c"
#include "arm_abs_q31.c"
#include "arm_abs_q7.c"
#include "arm_add_f32.c"
#include "arm_add_f64.c"
#include "arm_add_q15.c"
#include "arm_add_q31.c"
#include "arm_add_q7.c"
#include "arm_and_u16.c"
#include "arm_and_u32.c"
#include "arm_and_u8.c"
#include "arm_dot_prod_f32.c"
#include "arm_dot_prod_f64.c"
#include "arm_dot_prod_q15.c"
#include "arm_dot_prod_q31.c"
#include "arm_dot_prod_q7.c"
#include "arm_mult_f32.c"
#include "arm_mult_f64.c"
#include "arm_mult_q15.c"
#include "arm_mult_q31.c"
#include "arm_mult_q7.c"
#include "arm_negate_f32.c"
#include "arm_negate_f64.c"
#include "arm_negate_q15.c"
#include "arm_negate_q31.c"
#include "arm_negate_q7.c"
#include "arm_not_u16.c"
#include "arm_not_u32.c"
#include "arm_not_u8.c"
#include "arm_offset_f32.c"
#include "arm_offset_f64.c"
#include "arm_offset_q15.c"
#include "arm_offset_q31.c"
#include "arm_offset_q7.c"
#include "arm_or_u16.c"
#include "arm_or_u32.c"
#include "arm_or_u8.c"
#include "arm_scale_f32.c"
#include "arm_scale_f64.c"
#include "arm_scale_q15.c"
#include "arm_scale_q31.c"
#include "arm_scale_q7.c"
#include "arm_shift_q15.c"
#include "arm_shift_q31.c"
#include "arm_shift_q7.c"
#include "arm_sub_f32.c"
#include "arm_sub_f64.c"
#include "arm_sub_q15.c"
#include "arm_sub_q31.c"
#include "arm_sub_q7.c"
#include "arm_xor_u16.c"
#include "arm_xor_u32.c"
#include "arm_xor_u8.c"
#include "arm_clip_f32.c"
#include "arm_clip_q31.c"
#include "arm_clip_q15.c"
#include "arm_clip_q7.c"

View File

@@ -0,0 +1,37 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BasicMathFunctionsF16.c
* Description: Combination of all basic math function f16 source files.
*
* $Date: 20. April 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_abs_f16.c"
#include "arm_add_f16.c"
#include "arm_dot_prod_f16.c"
#include "arm_mult_f16.c"
#include "arm_negate_f16.c"
#include "arm_offset_f16.c"
#include "arm_scale_f16.c"
#include "arm_sub_f16.c"
#include "arm_clip_f16.c"

View File

@@ -0,0 +1,41 @@
cmake_minimum_required (VERSION 3.14)

project(CMSISDSPBasicMath)

# Project-provided helper modules; presumably they define the configLib() and
# configDsp() commands invoked further down.
include(configLib)
include(configDsp)

# Collect the per-datatype sources of this directory.
# CONFIGURE_DEPENDS makes the build system re-check the globs on every build,
# so a source file that is added or removed is picked up without having to
# re-run CMake by hand.
# Only files ending in a datatype suffix are matched, so aggregate files such
# as BasicMathFunctions.c (which #include the individual sources) are never
# compiled a second time.
file(GLOB SRCF64 CONFIGURE_DEPENDS "./*_f64.c")
file(GLOB SRCF32 CONFIGURE_DEPENDS "./*_f32.c")
file(GLOB SRCF16 CONFIGURE_DEPENDS "./*_f16.c")

file(GLOB SRCQ31 CONFIGURE_DEPENDS "./*_q31.c")
file(GLOB SRCQ15 CONFIGURE_DEPENDS "./*_q15.c")
file(GLOB SRCQ7  CONFIGURE_DEPENDS "./*_q7.c")

file(GLOB SRCU32 CONFIGURE_DEPENDS "./*_u32.c")
file(GLOB SRCU16 CONFIGURE_DEPENDS "./*_u16.c")
file(GLOB SRCU8  CONFIGURE_DEPENDS "./*_u8.c")

add_library(CMSISDSPBasicMath STATIC ${SRCF64})
target_sources(CMSISDSPBasicMath PRIVATE ${SRCF32})

# Half-precision sources are left out when building with ARMAC5 or when
# float16 support has been disabled explicitly.
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
  target_sources(CMSISDSPBasicMath PRIVATE ${SRCF16})
endif()

# Fixed-point and bitwise (unsigned) kernels.
target_sources(CMSISDSPBasicMath
  PRIVATE
    ${SRCQ31}
    ${SRCQ15}
    ${SRCQ7}
    ${SRCU32}
    ${SRCU16}
    ${SRCU8}
)

configLib(CMSISDSPBasicMath ${ROOT})
configDsp(CMSISDSPBasicMath ${ROOT})

### Includes
target_include_directories(CMSISDSPBasicMath PUBLIC "${DSP}/Include")

View File

@@ -0,0 +1,198 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f16.c
* Description: Floating-point vector absolute value
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@defgroup BasicAbs Vector Absolute Value
Computes the absolute value of a vector on an element-by-element basis.
<pre>
pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAbs
@{
*/
/**
  @brief         Half-precision floating-point vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  Three implementations are selected at compile time: an MVE (Helium) one,
  a NEON one and a scalar one with optional 4x loop unrolling.
 */

#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

void arm_abs_f16(
  const float16_t * pSrc,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t blkCnt;                               /* Loop counter */
    f16x8_t vec1;
    f16x8_t res;

    /* Compute 8 outputs at a time (one f16x8_t vector per iteration) */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        /* C = |A| */

        /* Calculate absolute values and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vabsq(vec1);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: remaining 1 to 7 samples */
    blkCnt = blockSize & 0x7;

    if (blkCnt > 0U)
    {
        /* C = |A| */
        /* Only the store is predicated; the load still fetches a whole vector. */
        mve_pred16_t p0 = vctp16q(blkCnt);
        vec1 = vld1q(pSrc);
        vstrhq_p(pDst, vabsq(vec1), p0);
    }
}

#else
#if defined(ARM_FLOAT16_SUPPORTED)

void arm_abs_f16(
  const float16_t * pSrc,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t blkCnt;                               /* Loop counter */

#if defined(ARM_MATH_NEON_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
    f16x8_t vec1;
    f16x8_t res;

    /* Compute 8 outputs at a time.
       vld1q_f16 / vst1q_f16 transfer eight half-precision lanes, so the
       iteration count, the pointer stride and the tail mask must all be based
       on 8; a stride of 4 would access memory past the end of the buffers. */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        /* C = |A| */

        /* Calculate absolute values and then store the results in the destination buffer. */
        vec1 = vld1q_f16(pSrc);
        res = vabsq_f16(vec1);
        vst1q_f16(pDst, res);

        /* Increment pointers */
        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: remaining 0 to 7 samples are handled by the scalar loop below */
    blkCnt = blockSize & 0x7;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* C = |A| */

        /* Calculate absolute and store result in destination buffer. */
        *pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
        *pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
        *pDst++ = (_Float16)fabsf((float32_t)*pSrc++);
        *pDst++ = (_Float16)fabsf((float32_t)*pSrc++);

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = blockSize % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
#endif /* #if defined(ARM_MATH_NEON_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

    /* Scalar loop: whole vector when no fast path is enabled, otherwise the tail */
    while (blkCnt > 0U)
    {
        /* C = |A| */

        /* Calculate absolute and store result in destination buffer. */
        *pDst++ = (_Float16)fabsf((float32_t)*pSrc++);

        /* Decrement loop counter */
        blkCnt--;
    }
}

#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,153 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f32.c
* Description: Floating-point vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
#include <math.h>
/**
* @ingroup groupMath
*/
/**
* @defgroup BasicAbs Vector Absolute Value
*
* Computes the absolute value of a vector on an element-by-element basis.
*
* <pre>
* pDst[n] = abs(pSrc[n]), 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAbs
* @{
*/
/**
 * @brief Single-precision floating-point vector absolute value.
 * @param[in]  *pSrc      points to the input buffer
 * @param[out] *pDst      points to the output buffer
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * When ARM_MATH_DSP is defined the bulk of the vector is processed four
 * samples per iteration; the leftover samples (or the whole vector when
 * ARM_MATH_DSP is not defined) go through a one-sample-per-iteration loop.
 */
void arm_abs_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples still to be processed */

#if defined (ARM_MATH_DSP)

  float32_t s0, s1, s2, s3;                      /* the four samples of one group */

  /* Unrolled part: one group of four samples per iteration */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    /* Fetch the complete group before anything is written back */
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];

    /* C = |A| */
    s0 = fabsf(s0);
    s1 = fabsf(s1);
    s2 = fabsf(s2);
    s3 = fabsf(s3);

    /* Write the group to the destination */
    pDst[0] = s0;
    pDst[1] = s1;
    pDst[2] = s2;
    pDst[3] = s3;

    /* Move on to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample is handled by the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = |A| */
    *pDst++ = fabsf(*pSrc++);
  }
}
/**
* @} end of BasicAbs group
*/

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_f64.c
* Description: Floating-point vector absolute value
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
#include <math.h>
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAbs
@{
*/
/**
  @brief         Double-precision floating-point vector absolute value.
  @param[in]     pSrc       points to the input vector
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
void arm_abs_f64(
  const float64_t * pSrc,
        float64_t * pDst,
        uint32_t blockSize)
{
  uint32_t idx;                                  /* Index of the current sample */

  /* C = |A|, one sample per iteration, in ascending order */
  for (idx = 0U; idx < blockSize; idx++)
  {
    pDst[idx] = fabs(pSrc[idx]);
  }
}
/**
@} end of BasicAbs group
*/

View File

@@ -0,0 +1,167 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q15.c
* Description: Q15 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
*/
/**
 * @brief Q15 vector absolute value.
 * @param[in]  *pSrc      points to the input buffer
 * @param[out] *pDst      points to the output buffer
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples / groups still to be processed */

#if defined (ARM_MATH_DSP)

  __SIMD32_TYPE *pPacked;                        /* destination viewed as packed sample pairs */
  q15_t first, second;                           /* the two inputs of one pair */
  q15_t absFirst, absSecond;                     /* their saturated absolute values */
  uint32_t pair;                                 /* pair counter inside a group of four */

  pPacked = __SIMD32_CONST(pDst);

  /* Unrolled part: four samples per iteration, written as two packed pairs */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    for (pair = 0U; pair < 2U; pair++)
    {
      /* Read two inputs */
      first = *pSrc++;
      second = *pSrc++;

      /* C = |A|, negative inputs go through a saturating subtraction from zero */
      absFirst = (first > 0) ? first : (q15_t)__QSUB16(0, first);
      absSecond = (second > 0) ? second : (q15_t)__QSUB16(0, second);

      /* Pack both results and store them with a single write */
#ifndef ARM_MATH_BIG_ENDIAN
      *pPacked++ = __PKHBT(absFirst, absSecond, 16);
#else
      *pPacked++ = __PKHBT(absSecond, absFirst, 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    }
  }

  /* Continue with plain q15 stores right after the packed output */
  pDst = (q15_t *)pPacked;

  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  for (remaining = blockSize & 0x3U; remaining > 0U; remaining--)
  {
    /* C = |A| */
    first = *pSrc++;
    *pDst++ = (first > 0) ? first : (q15_t)__QSUB16(0, first);
  }

#else

  q15_t sample;                                  /* current input value */

  /* No DSP intrinsics: saturate explicitly, one sample per iteration */
  for (remaining = blockSize; remaining > 0U; remaining--)
  {
    sample = *pSrc++;

    /* C = |A| */
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == (q15_t) 0x8000)
    {
      /* the most negative value has no positive counterpart: clamp it */
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }

#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAbs group
*/

View File

@@ -0,0 +1,118 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q31.c
* Description: Q31 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
*/
/**
 * @brief Q31 vector absolute value.
 * @param[in]  *pSrc      points to the input buffer
 * @param[out] *pDst      points to the output buffer
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
void arm_abs_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples / groups still to be processed */
  q31_t sample;                                  /* input of the single-sample loop */

#if defined (ARM_MATH_DSP)

  q31_t s0, s1, s2, s3;                          /* the four samples of one group */

  /* Unrolled part: one group of four samples per iteration */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    /* Fetch the complete group before anything is written back */
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];
    pSrc += 4U;

    /* C = |A|, negative inputs go through a saturating subtraction from zero */
    pDst[0] = (s0 > 0) ? s0 : (q31_t)__QSUB(0, s0);
    pDst[1] = (s1 > 0) ? s1 : (q31_t)__QSUB(0, s1);
    pDst[2] = (s2 > 0) ? s2 : (q31_t)__QSUB(0, s2);
    pDst[3] = (s3 > 0) ? s3 : (q31_t)__QSUB(0, s3);
    pDst += 4U;
  }

  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample is handled by the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    sample = *pSrc++;

    /* C = |A| */
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == INT32_MIN)
    {
      /* the most negative value has no positive counterpart: clamp it */
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
/**
* @} end of BasicAbs group
*/

View File

@@ -0,0 +1,145 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_abs_q7.c
* Description: Q7 vector absolute value
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAbs
* @{
*/
/**
 * @brief Q7 vector absolute value.
 * @param[in]  *pSrc      points to the input buffer
 * @param[out] *pDst      points to the output buffer
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * \par Conditions for optimum performance
 * Input and output buffers should be aligned by 32-bit
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */
void arm_abs_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples / groups still to be processed */
  q7_t sample;                                   /* input of the single-sample loop */

#if defined (ARM_MATH_DSP)

  q31_t v0, v1, v2, v3;                          /* widened inputs of one group */
  q31_t r0, r1, r2, r3;                          /* corresponding results */

  /* Unrolled part: one group of four samples per iteration */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    /* Fetch the complete group before anything is written back */
    v0 = (q31_t) pSrc[0];
    v1 = (q31_t) pSrc[1];
    v2 = (q31_t) pSrc[2];
    v3 = (q31_t) pSrc[3];

    /* C = |A|, negative inputs go through a saturating subtraction from zero */
    r0 = (v0 > 0) ? v0 : (q31_t)__QSUB8(0, v0);
    r1 = (v1 > 0) ? v1 : (q31_t)__QSUB8(0, v1);
    r2 = (v2 > 0) ? v2 : (q31_t)__QSUB8(0, v2);
    r3 = (v3 > 0) ? v3 : (q31_t)__QSUB8(0, v3);

    /* Narrow the results back to q7 and store them */
    pDst[0] = (q7_t) r0;
    pDst[1] = (q7_t) r1;
    pDst[2] = (q7_t) r2;
    pDst[3] = (q7_t) r3;

    /* Move on to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample is handled by the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    sample = *pSrc++;

    /* C = |A| */
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == (q7_t) 0x80)
    {
      /* the most negative value has no positive counterpart: clamp it */
      *pDst++ = 0x7f;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
/**
* @} end of BasicAbs group
*/

View File

@@ -0,0 +1,169 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f16.c
* Description: Floating-point vector addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicAdd Vector Addition
Element-by-element addition of two vectors.
<pre>
pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicAdd
@{
*/
/**
  @brief         Half-precision floating-point vector addition.
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  An MVE (Helium) implementation is used when ARM_MATH_MVE_FLOAT16 is defined
  and auto-vectorization is not requested; otherwise a scalar implementation
  with optional 4x loop unrolling is built when ARM_FLOAT16_SUPPORTED is defined.
 */

#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

void arm_add_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t vecCnt;                               /* Number of full vectors left */
    uint32_t tailCnt;                              /* Samples left after the full vectors */
    f16x8_t lhs;
    f16x8_t rhs;
    f16x8_t sum;

    /* Full vectors: 8 outputs per iteration */
    for (vecCnt = blockSize >> 3U; vecCnt > 0U; vecCnt--)
    {
        /* C = A + B */
        lhs = vld1q(pSrcA);
        rhs = vld1q(pSrcB);
        sum = vaddq(lhs, rhs);
        vst1q(pDst, sum);

        /* Advance to the next vector */
        pSrcA += 8;
        pSrcB += 8;
        pDst += 8;
    }

    /* Tail: remaining 1 to 7 samples */
    tailCnt = blockSize & 0x7;
    if (tailCnt > 0U)
    {
        /* C = A + B */
        /* Only the store is predicated; both loads still fetch a whole vector. */
        mve_pred16_t p0 = vctp16q(tailCnt);
        lhs = vld1q(pSrcA);
        rhs = vld1q(pSrcB);
        vstrhq_p(pDst, vaddq(lhs,rhs), p0);
    }
}

#else
#if defined(ARM_FLOAT16_SUPPORTED)

void arm_add_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t remaining;                            /* Samples / groups still to be processed */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: 4 outputs per iteration, each stored before the next pair is read */
    for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
    {
        /* C = A + B */
        pDst[0] = (_Float16)(pSrcA[0]) + (_Float16)(pSrcB[0]);
        pDst[1] = (_Float16)(pSrcA[1]) + (_Float16)(pSrcB[1]);
        pDst[2] = (_Float16)(pSrcA[2]) + (_Float16)(pSrcB[2]);
        pDst[3] = (_Float16)(pSrcA[3]) + (_Float16)(pSrcB[3]);

        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }

    /* Loop unrolling: 0 to 3 outputs are left */
    remaining = blockSize & 0x3U;

#else

    /* No unrolling: every sample is handled by the loop below */
    remaining = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

    for (; remaining > 0U; remaining--)
    {
        /* C = A + B */
        *pDst++ = (_Float16)(*pSrcA++) + (_Float16)(*pSrcB++);
    }
}

#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,138 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f32.c
* Description: Floating-point vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup BasicAdd Vector Addition
*
* Element-by-element addition of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] + pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicAdd
* @{
*/
/**
 * @brief Single-precision floating-point vector addition.
 * @param[in]  *pSrcA     points to the first input vector
 * @param[in]  *pSrcB     points to the second input vector
 * @param[out] *pDst      points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * When ARM_MATH_DSP is defined the bulk of the vector is processed four
 * samples per iteration; the leftover samples (or the whole vector when
 * ARM_MATH_DSP is not defined) go through a one-sample-per-iteration loop.
 */
void arm_add_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples / groups still to be processed */

#if defined (ARM_MATH_DSP)

  float32_t sum0, sum1, sum2, sum3;              /* results of one group of four */

  /* Unrolled part: one group of four samples per iteration */
  for (remaining = blockSize >> 2U; remaining > 0U; remaining--)
  {
    /* C = A + B: all inputs of the group are consumed before any output is written */
    sum0 = pSrcA[0] + pSrcB[0];
    sum1 = pSrcA[1] + pSrcB[1];
    sum2 = pSrcA[2] + pSrcB[2];
    sum3 = pSrcA[3] + pSrcB[3];

    /* Write the group to the destination */
    pDst[0] = sum0;
    pDst[1] = sum1;
    pDst[2] = sum2;
    pDst[3] = sum3;

    /* Move on to the next group */
    pSrcA += 4U;
    pSrcB += 4U;
    pDst += 4U;
  }

  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample is handled by the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = A + B */
    *pDst++ = (*pSrcA++) + (*pSrcB++);
  }
}
/**
* @} end of BasicAdd group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_f64.c
* Description: Floating-point vector addition
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicAdd
@{
*/
/**
  @brief         Element-by-element addition of two double-precision vectors.
  @param[in]     pSrcA      points to first input vector
  @param[in]     pSrcB      points to second input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] + pSrcB[n] for 0 <= n < blockSize.
                   Nothing is read or written when blockSize is 0.
 */
void arm_add_f64(
  const float64_t * pSrcA,
  const float64_t * pSrcB,
        float64_t * pDst,
        uint32_t blockSize)
{
  uint32_t idx;   /* index of the sample being produced */

  for (idx = 0U; idx < blockSize; idx++)
  {
    /* C = A + B */
    pDst[idx] = pSrcA[idx] + pSrcB[idx];
  }
}
/**
@} end of BasicAdd group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q15.c
* Description: Q15 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
*/
/**
 * @brief Element-by-element saturating addition of two Q15 vectors.
 * @param[in]  *pSrcA     points to the first input vector
 * @param[in]  *pSrcB     points to the second input vector
 * @param[out] *pDst      points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
 */
void arm_add_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                           /* loop counter */

#if defined (ARM_MATH_DSP)
  q31_t pairA0, pairA1, pairB0, pairB1;     /* 32-bit words holding two Q15 samples each */

  /* Unrolled section: four outputs per pass, moved as two 32-bit words
  ** per operand through the __SIMD32 pointer view. */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    pairA0 = *__SIMD32(pSrcA)++;
    pairA1 = *__SIMD32(pSrcA)++;
    pairB0 = *__SIMD32(pSrcB)++;
    pairB1 = *__SIMD32(pSrcB)++;

    /* C = A + B */
    *__SIMD32(pDst)++ = __QADD16(pairA0, pairB0);
    *__SIMD32(pDst)++ = __QADD16(pairA1, pairB1);
  }

  /* Leftovers (0 to 3 samples), one at a time. */
  for (count = blockSize % 0x4U; count > 0U; count--)
  {
    /* C = A + B */
    *pDst++ = (q15_t) __QADD16(*pSrcA++, *pSrcB++);
  }
#else
  /* Build without ARM_MATH_DSP: one sample per pass over the whole vector. */
  for (count = blockSize; count > 0U; count--)
  {
    /* C = A + B, summed in 32 bits and then saturated to 16 bits */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ + *pSrcB++), 16);
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
*/

View File

@@ -0,0 +1,136 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q31.c
* Description: Q31 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
*/
/**
 * @brief Element-by-element saturating addition of two Q31 vectors.
 * @param[in]  *pSrcA     points to the first input vector
 * @param[in]  *pSrcB     points to the second input vector
 * @param[out] *pDst      points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
 */
void arm_add_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;   /* loop counter */

#if defined (ARM_MATH_DSP)
  /* Unrolled section: four outputs per pass. */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    /* Fetch the four operand pairs before the first store. */
    const q31_t a0 = pSrcA[0];
    const q31_t a1 = pSrcA[1];
    const q31_t b0 = pSrcB[0];
    const q31_t b1 = pSrcB[1];
    const q31_t a2 = pSrcA[2];
    const q31_t a3 = pSrcA[3];
    const q31_t b2 = pSrcB[2];
    const q31_t b3 = pSrcB[3];

    pSrcA += 4;
    pSrcB += 4;

    /* C = A + B */
    *pDst++ = __QADD(a0, b0);
    *pDst++ = __QADD(a1, b1);
    *pDst++ = __QADD(a2, b2);
    *pDst++ = __QADD(a3, b3);
  }

  /* Leftovers (0 to 3 samples), one at a time. */
  for (count = blockSize % 0x4U; count > 0U; count--)
  {
    /* C = A + B */
    *pDst++ = __QADD(*pSrcA++, *pSrcB++);
  }
#else
  /* Build without ARM_MATH_DSP: one sample per pass over the whole vector. */
  for (count = blockSize; count > 0U; count--)
  {
    /* C = A + B, summed in 64 bits and then clipped back to Q31 */
    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ + *pSrcB++);
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_add_q7.c
* Description: Q7 vector addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicAdd
* @{
*/
/**
 * @brief Element-by-element saturating addition of two Q7 vectors.
 * @param[in]  *pSrcA     points to the first input vector
 * @param[in]  *pSrcB     points to the second input vector
 * @param[out] *pDst      points to the output vector
 * @param[in]  blockSize  number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
void arm_add_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;   /* loop counter */

#if defined (ARM_MATH_DSP)
  /* Unrolled section: four outputs per pass, moved as one 32-bit word
  ** per operand through the __SIMD32 pointer view. */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    /* C = A + B */
    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
  }

  /* Leftovers (0 to 3 samples), one at a time. */
  for (count = blockSize % 0x4U; count > 0U; count--)
  {
    /* C = A + B, saturated to 8 bits */
    *pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);
  }
#else
  /* Build without ARM_MATH_DSP: one sample per pass over the whole vector. */
  for (count = blockSize; count > 0U; count--)
  {
    /* C = A + B, saturated to 8 bits */
    *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ + *pSrcB++, 8);
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicAdd group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u16.c
* Description: uint16_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup And Vector bitwise AND
Compute the logical bitwise AND.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup And
@{
*/
/**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] & pSrcB[n] for 0 <= n < blockSize.
                   Nothing is read or written when blockSize is 0.
 */
void arm_and_u16(
    const uint16_t * pSrcA,
    const uint16_t * pSrcB,
          uint16_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecSrcA, vecSrcB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vandq_u16(vecSrcA, vecSrcB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 samples left */
    blkCnt = blockSize & 7;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Load under the same predicate as the store so that the lanes
           beyond the remaining samples are not fetched: a plain vld1q here
           would read up to 7 elements past the end of both inputs. */
        vecSrcA = vldrhq_z_u16(pSrcA, p0);
        vecSrcB = vldrhq_z_u16(pSrcB, p0);
        vstrhq_p(pDst, vandq_u16(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecA, vecB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u16(pSrcA);
        vecB = vld1q_u16(pSrcB);

        vst1q_u16(pDst, vandq_u16(vecA, vecB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the loop below */
    blkCnt = blockSize & 7;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u32.c
* Description: uint32_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] & pSrcB[n] for 0 <= n < blockSize.
                   Nothing is read or written when blockSize is 0.
 */
void arm_and_u32(
    const uint32_t * pSrcA,
    const uint32_t * pSrcB,
          uint32_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecSrcA, vecSrcB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vandq_u32(vecSrcA, vecSrcB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 3 samples left */
    blkCnt = blockSize & 3;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        /* Load under the same predicate as the store so that the lanes
           beyond the remaining samples are not fetched: a plain vld1q here
           would read up to 3 elements past the end of both inputs. */
        vecSrcA = vldrwq_z_u32(pSrcA, p0);
        vecSrcB = vldrwq_z_u32(pSrcB, p0);
        vstrwq_p(pDst, vandq_u32(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecA, vecB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u32(pSrcA);
        vecB = vld1q_u32(pSrcB);

        vst1q_u32(pDst, vandq_u32(vecA, vecB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the loop below */
    blkCnt = blockSize & 3;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_and_u8.c
* Description: uint8_t bitwise AND
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup And
@{
*/
/**
  @brief         Compute the logical bitwise AND of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] & pSrcB[n] for 0 <= n < blockSize.
                   Nothing is read or written when blockSize is 0.
 */
void arm_and_u8(
    const uint8_t * pSrcA,
    const uint8_t * pSrcB,
          uint8_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecSrcA, vecSrcB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vandq_u8(vecSrcA, vecSrcB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 15 samples left */
    blkCnt = blockSize & 0xF;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);

        /* Load under the same predicate as the store so that the lanes
           beyond the remaining samples are not fetched: a plain vld1q here
           would read up to 15 elements past the end of both inputs. */
        vecSrcA = vldrbq_z_u8(pSrcA, p0);
        vecSrcB = vldrbq_z_u8(pSrcB, p0);
        vstrbq_p(pDst, vandq_u8(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecA, vecB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u8(pSrcA);
        vecB = vld1q_u8(pSrcB);

        vst1q_u8(pDst, vandq_u8(vecA, vecB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the loop below */
    blkCnt = blockSize & 0xF;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)&(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of And group
*/

View File

@@ -0,0 +1,141 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f16.c
* Description: Floating-point (f16) elementwise clipping
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
* Helium (MVE) variant of arm_clip_f16.
*
* Bounds each of the numSamples half-precision values at pSrc with
* vmaxnmq against low followed by vminnmq against high, and stores the
* results at pDst. Data is handled eight lanes per vector.
*
* NOTE(review): the first vector is loaded before numSamples is examined,
* and every later vld1q fetches the vector for the NEXT step. Only the
* stores are bounded, so this routine always reads between 1 and 8
* elements past pSrc[numSamples - 1] (a full vector when numSamples is 0).
* Confirm the input buffers can tolerate that over-read.
*/
void arm_clip_f16(const float16_t * pSrc,
float16_t * pDst,
float16_t low,
float16_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f16x8_t curVec0, curVec1;
f16x8_t vecLow, vecHigh;
/* Broadcast both bounds to all lanes once, outside the loops. */
vecLow = vdupq_n_f16(low);
vecHigh = vdupq_n_f16(high);
/* Pipeline prologue: preload the first vector (unconditional, see note above). */
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*
* Each pass finishes two vectors (16 samples); the load of one vector is
* placed between the max and min of the other.
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxnmq(curVec1, vecLow);
/* Preload for the next pass, or for the tail when this is the last pass. */
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling: blkCnt is numSamples modulo 16 (0..15).
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
/* One more complete vector: finish the preloaded one, then preload again. */
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
/* The predicate is built from the last (blkCnt & 7) samples, so the
* store below is limited to them. */
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
/*
 * Scalar variant of arm_clip_f16: walks the input once and writes, for every
 * sample, high when the sample is above high, low when it is below low, and
 * the sample itself otherwise. Comparisons are made on _Float16 values.
 */
void arm_clip_f16(const float16_t * pSrc,
  float16_t * pDst,
  float16_t low,
  float16_t high,
  uint32_t numSamples)
{
    const float16_t *pIn = pSrc;     /* read cursor */
    float16_t *pOut = pDst;          /* write cursor */

    for (uint32_t remaining = numSamples; remaining > 0U; remaining--)
    {
        const float16_t sample = *pIn++;
        float16_t clipped = sample;  /* default: value passes through unchanged */

        if ((_Float16)sample > (_Float16)high)
        {
            clipped = high;
        }
        else if ((_Float16)sample < (_Float16)low)
        {
            clipped = low;
        }

        *pOut++ = clipped;
    }
}
#endif /* defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,144 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_f32.c
* Description: Floating-point elementwise clipping
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicClip Elementwise clipping
Element-by-element clipping of a value.
The value is constrained between 2 bounds.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise floating-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
* Helium (MVE) variant of arm_clip_f32.
*
* Bounds each of the numSamples single-precision values at pSrc with
* vmaxnmq against low followed by vminnmq against high, and stores the
* results at pDst. Data is handled four lanes per vector.
*
* NOTE(review): the first vector is loaded before numSamples is examined,
* and every later vld1q fetches the vector for the NEXT step. Only the
* stores are bounded, so this routine always reads between 1 and 4
* elements past pSrc[numSamples - 1] (a full vector when numSamples is 0).
* Confirm the input buffers can tolerate that over-read.
*/
void arm_clip_f32(const float32_t * pSrc,
float32_t * pDst,
float32_t low,
float32_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
f32x4_t curVec0, curVec1;
f32x4_t vecLow, vecHigh;
/* Broadcast both bounds to all lanes once, outside the loops. */
vecLow = vdupq_n_f32(low);
vecHigh = vdupq_n_f32(high);
/* Pipeline prologue: preload the first vector (unconditional, see note above). */
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*
* Each pass finishes two vectors (8 samples); the load of one vector is
* placed between the max and min of the other.
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxnmq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxnmq(curVec1, vecLow);
/* Preload for the next pass, or for the tail when this is the last pass. */
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminnmq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling: blkCnt is numSamples modulo 8 (0..7).
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
/* One more complete vector: finish the preloaded one, then preload again. */
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
/* The predicate is built from the last (blkCnt & 3) samples, so the
* store below is limited to them. */
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxnmq(curVec0, vecLow);
curVec0 = vminnmq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
/*
 * Scalar variant of arm_clip_f32: walks the input once and writes, for every
 * sample, high when the sample is above high, low when it is below low, and
 * the sample itself otherwise.
 */
void arm_clip_f32(const float32_t * pSrc,
  float32_t * pDst,
  float32_t low,
  float32_t high,
  uint32_t numSamples)
{
    const float32_t *pIn = pSrc;     /* read cursor */
    float32_t *pOut = pDst;          /* write cursor */
    uint32_t remaining;              /* samples still to be processed */

    for (remaining = numSamples; remaining > 0U; remaining--)
    {
        const float32_t sample = *pIn++;
        float32_t clipped = sample;  /* default: value passes through unchanged */

        if (sample > high)
        {
            clipped = high;
        }
        else if (sample < low)
        {
            clipped = low;
        }

        *pOut++ = clipped;
    }
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q15.c
* Description: Q15 elementwise clipping
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
* Helium (MVE) variant of arm_clip_q15.
*
* Bounds each of the numSamples Q15 values at pSrc with vmaxq against low
* followed by vminq against high, and stores the results at pDst. Data is
* handled eight lanes per vector.
*
* NOTE(review): the first vector is loaded before numSamples is examined,
* and every later vld1q fetches the vector for the NEXT step. Only the
* stores are bounded, so this routine always reads between 1 and 8
* elements past pSrc[numSamples - 1] (a full vector when numSamples is 0).
* Confirm the input buffers can tolerate that over-read.
*/
void arm_clip_q15(const q15_t * pSrc,
q15_t * pDst,
q15_t low,
q15_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q15x8_t curVec0, curVec1;
q15x8_t vecLow, vecHigh;
/* Broadcast both bounds to all lanes once, outside the loops. */
vecLow = vdupq_n_s16(low);
vecHigh = vdupq_n_s16(high);
/* Pipeline prologue: preload the first vector (unconditional, see note above). */
curVec0 = vld1q(pSrc);
pSrc += 8;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*
* Each pass finishes two vectors (16 samples); the load of one vector is
* placed between the max and min of the other.
*/
blkCnt = numSamples >> 4;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 8;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec1 = vmaxq(curVec1, vecLow);
/* Preload for the next pass, or for the tail when this is the last pass. */
curVec0 = vld1q(pSrc);
pSrc += 8;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 8;
}
/*
* Tail handling: blkCnt is numSamples modulo 16 (0..15).
*/
blkCnt = numSamples - ((numSamples >> 4) << 4);
if (blkCnt >= 8)
{
/* One more complete vector: finish the preloaded one, then preload again. */
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 8;
curVec0 = vld1q(pSrc);
pSrc += 8;
}
if (blkCnt > 0)
{
/* The predicate is built from the last (blkCnt & 7) samples, so the
* store below is limited to them. */
mve_pred16_t p0 = vctp16q(blkCnt & 7);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrhq_p(pDst, curVec0, p0);
}
}
#else
/*
 * Scalar variant of arm_clip_q15: walks the input once and writes, for every
 * sample, high when the sample is above high, low when it is below low, and
 * the sample itself otherwise.
 */
void arm_clip_q15(const q15_t * pSrc,
  q15_t * pDst,
  q15_t low,
  q15_t high,
  uint32_t numSamples)
{
    const q15_t *pIn = pSrc;     /* read cursor */
    q15_t *pOut = pDst;          /* write cursor */
    uint32_t remaining;          /* samples still to be processed */

    for (remaining = numSamples; remaining > 0U; remaining--)
    {
        const q15_t sample = *pIn++;
        q15_t clipped = sample;  /* default: value passes through unchanged */

        if (sample > high)
        {
            clipped = high;
        }
        else if (sample < low)
        {
            clipped = low;
        }

        *pOut++ = clipped;
    }
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q31.c
* Description: Q31 elementwise clipping
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
* Helium (MVE) variant of arm_clip_q31.
*
* Bounds each of the numSamples Q31 values at pSrc with vmaxq against low
* followed by vminq against high, and stores the results at pDst. Data is
* handled four lanes per vector.
*
* NOTE(review): the first vector is loaded before numSamples is examined,
* and every later vld1q fetches the vector for the NEXT step. Only the
* stores are bounded, so this routine always reads between 1 and 4
* elements past pSrc[numSamples - 1] (a full vector when numSamples is 0).
* Confirm the input buffers can tolerate that over-read.
*/
void arm_clip_q31(const q31_t * pSrc,
q31_t * pDst,
q31_t low,
q31_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q31x4_t curVec0, curVec1;
q31x4_t vecLow, vecHigh;
/* Broadcast both bounds to all lanes once, outside the loops. */
vecLow = vdupq_n_s32(low);
vecHigh = vdupq_n_s32(high);
/* Pipeline prologue: preload the first vector (unconditional, see note above). */
curVec0 = vld1q(pSrc);
pSrc += 4;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*
* Each pass finishes two vectors (8 samples); the load of one vector is
* placed between the max and min of the other.
*/
blkCnt = numSamples >> 3;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 4;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec1 = vmaxq(curVec1, vecLow);
/* Preload for the next pass, or for the tail when this is the last pass. */
curVec0 = vld1q(pSrc);
pSrc += 4;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 4;
}
/*
* Tail handling: blkCnt is numSamples modulo 8 (0..7).
*/
blkCnt = numSamples - ((numSamples >> 3) << 3);
if (blkCnt >= 4)
{
/* One more complete vector: finish the preloaded one, then preload again. */
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 4;
curVec0 = vld1q(pSrc);
pSrc += 4;
}
if (blkCnt > 0)
{
/* The predicate is built from the last (blkCnt & 3) samples, so the
* store below is limited to them. */
mve_pred16_t p0 = vctp32q(blkCnt & 3);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrwq_p(pDst, curVec0, p0);
}
}
#else
/*
 * Scalar variant of arm_clip_q31: walks the input once and writes, for every
 * sample, high when the sample is above high, low when it is below low, and
 * the sample itself otherwise.
 */
void arm_clip_q31(const q31_t * pSrc,
  q31_t * pDst,
  q31_t low,
  q31_t high,
  uint32_t numSamples)
{
    const q31_t *pIn = pSrc;     /* read cursor */
    q31_t *pOut = pDst;          /* write cursor */
    uint32_t remaining;          /* samples still to be processed */

    for (remaining = numSamples; remaining > 0U; remaining--)
    {
        const q31_t sample = *pIn++;
        q31_t clipped = sample;  /* default: value passes through unchanged */

        if (sample > high)
        {
            clipped = high;
        }
        else if (sample < low)
        {
            clipped = low;
        }

        *pOut++ = clipped;
    }
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_clip_q7.c
* Description: Q7 elementwise clipping
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicClip
@{
*/
/**
@brief Elementwise fixed-point clipping
@param[in] pSrc points to input values
@param[out] pDst points to output clipped values
@param[in] low lower bound
@param[in] high higher bound
@param[in] numSamples number of samples to clip
@return none
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * Helium (MVE) implementation.
 * Each vector holds 16 Q7 lanes; every vector is clamped with
 * vmaxq(.., vecLow) followed by vminq(.., vecHigh).
 * The loop is software-pipelined: the vector for the next step is always
 * loaded before the current one is stored.
 *
 * NOTE(review): all loads are unpredicated and the first one is issued
 * before numSamples is examined, so up to a full vector past the last
 * requested sample can be read from pSrc (the values are never stored).
 * Confirm that callers' input buffers tolerate this over-read.
 */
void arm_clip_q7(const q7_t * pSrc,
q7_t * pDst,
q7_t low,
q7_t high,
uint32_t numSamples)
{
uint32_t blkCnt;
q7x16_t curVec0, curVec1;
q7x16_t vecLow, vecHigh;
/* Broadcast the scalar bounds to all 16 lanes */
vecLow = vdupq_n_s8(low);
vecHigh = vdupq_n_s8(high);
/* Prologue: preload the first vector so the loop can overlap load and clamp */
curVec0 = vld1q(pSrc);
pSrc += 16;
/*
* unrolled x 2 to allow
* vldr/vstr/vmin/vmax
* stall free interleaving
*/
/* Each iteration consumes 32 samples (two vectors) */
blkCnt = numSamples >> 5;
while (blkCnt--)
{
curVec0 = vmaxq(curVec0, vecLow);
curVec1 = vld1q(pSrc);
pSrc += 16;
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec1 = vmaxq(curVec1, vecLow);
/* Preload the vector for the next iteration (or for the tail) */
curVec0 = vld1q(pSrc);
pSrc += 16;
curVec1 = vminq(curVec1, vecHigh);
vst1q(pDst, curVec1);
pDst += 16;
}
/*
* Tail handling
*/
/* Samples left after the 32-sample blocks: 0..31 */
blkCnt = numSamples - ((numSamples >> 5) << 5);
if (blkCnt >= 16)
{
/* One more full vector: clamp and store the preloaded data, then load the next */
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vst1q(pDst, curVec0);
pDst += 16;
curVec0 = vld1q(pSrc);
pSrc += 16;
}
if (blkCnt > 0)
{
/* Predicated store of the last (blkCnt & 0xf) lanes only.
   When blkCnt is exactly 16 the lane count is 0, so presumably
   no lane is written -- TODO confirm against the vctp8q definition. */
mve_pred16_t p0 = vctp8q(blkCnt & 0xf);
curVec0 = vmaxq(curVec0, vecLow);
curVec0 = vminq(curVec0, vecHigh);
vstrbq_p(pDst, curVec0, p0);
}
}
#else
/* Scalar implementation: clamp each Q7 sample of pSrc and write it to pDst. */
void arm_clip_q7(const q7_t * pSrc,
  q7_t * pDst,
  q7_t low,
  q7_t high,
  uint32_t numSamples)
{
    uint32_t remaining = numSamples;   /* samples still to process */

    while (remaining > 0U)
    {
        /* Fetch the sample once, then clamp the local copy */
        q7_t sample = *pSrc++;

        /* The upper bound is tested before the lower bound */
        if (sample > high)
        {
            sample = high;
        }
        else if (sample < low)
        {
            sample = low;
        }

        *pDst++ = sample;
        remaining--;
    }
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of BasicClip group
*/

View File

@@ -0,0 +1,184 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f16.c
* Description: Floating-point dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicDotProd Vector Dot Product
Computes the dot product of two vectors.
The vectors are multiplied element-by-element and then summed.
<pre>
sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * Helium (MVE) half-precision implementation.
 * Eight per-lane partial sums are kept in vecSum and reduced to a
 * single value once all samples have been accumulated.
 */
void arm_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t blockSize,
float16_t * result)
{
f16x8_t vecA, vecB;
f16x8_t vecSum;
uint32_t blkCnt;
float16_t sum = 0.0f;
/* Start every lane accumulator at zero */
vecSum = vdupq_n_f16(0.0f);
/* Process 8 samples (one f16x8 vector) per iteration */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/*
* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
* Load one vector from each input, advance both source pointers,
* and multiply-accumulate lane by lane into vecSum.
*/
vecA = vld1q(pSrcA);
pSrcA += 8;
vecB = vld1q(pSrcB);
pSrcB += 8;
vecSum = vfmaq(vecSum, vecA, vecB);
/*
* Decrement the blockSize loop counter
*/
blkCnt --;
}
/* Remaining 0..7 samples */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
/* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
/* Only the first blkCnt lanes are accumulated by the predicated vfmaq_m.
   NOTE(review): the two loads below are unpredicated, so up to 7 elements
   past the end of each input are read -- confirm callers tolerate this. */
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcA);
vecB = vld1q(pSrcB);
vecSum = vfmaq_m(vecSum, vecA, vecB, p0);
}
/* Reduce the lane accumulators to one scalar; vecAddAcrossF16Mve is
   defined elsewhere -- presumably a horizontal add across the 8 lanes. */
sum = vecAddAcrossF16Mve(vecSum);
/* Store result in destination buffer */
*result = sum;
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
/* Scalar half-precision dot product; products are accumulated in _Float16, in source order. */
void arm_dot_prod_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
  uint32_t blockSize,
  float16_t * result)
{
    _Float16 acc = 0.0f;   /* running sum */
    uint32_t remaining;    /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t quads;        /* number of complete groups of four samples */

    /* Unrolled part: four multiply-accumulates per iteration */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        acc += (_Float16)(pSrcA[0]) * (_Float16)(pSrcB[0]);
        acc += (_Float16)(pSrcA[1]) * (_Float16)(pSrcB[1]);
        acc += (_Float16)(pSrcA[2]) * (_Float16)(pSrcB[2]);
        acc += (_Float16)(pSrcA[3]) * (_Float16)(pSrcB[3]);

        pSrcA += 4;
        pSrcB += 4;
    }

    /* The 0..3 samples that did not fill a group */
    remaining = blockSize % 0x4U;
#else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining > 0U; remaining--)
    {
        /* acc += A[k] * B[k] */
        acc += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
    }

    /* Hand the accumulated sum back to the caller */
    *result = acc;
}
#endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,123 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f32.c
* Description: Floating-point dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup dot_prod Vector Dot Product
*
* Computes the dot product of two vectors.
* The vectors are multiplied element-by-element and then summed.
*
* <pre>
* sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup dot_prod
* @{
*/
/**
* @brief Dot product of floating-point vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*/
/* Single-precision dot product; products are accumulated in source order. */
void arm_dot_prod_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  uint32_t blockSize,
  float32_t * result)
{
    float32_t acc = 0.0f;   /* running sum */
    uint32_t remaining;     /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)
    uint32_t quads;         /* number of complete groups of four samples */

    /* Unrolled part: four multiply-accumulates per iteration */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        acc += pSrcA[0] * pSrcB[0];
        acc += pSrcA[1] * pSrcB[1];
        acc += pSrcA[2] * pSrcB[2];
        acc += pSrcA[3] * pSrcB[3];

        pSrcA += 4;
        pSrcB += 4;
    }

    /* The 0..3 samples that did not fill a group */
    remaining = blockSize % 0x4U;
#else
    /* Build without ARM_MATH_DSP: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

    for (; remaining > 0U; remaining--)
    {
        /* acc += A[k] * B[k] */
        acc += (*pSrcA++) * (*pSrcB++);
    }

    /* Hand the accumulated sum back to the caller */
    *result = acc;
}
/**
* @} end of dot_prod group
*/

View File

@@ -0,0 +1,78 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_f64.c
* Description: Floating-point dot product
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicDotProd
@{
*/
/**
@brief Dot product of floating-point vectors.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[in] blockSize number of samples in each vector.
@param[out] result output result returned here.
@return none
*/
/* Double-precision dot product; one multiply-accumulate per sample, in source order. */
void arm_dot_prod_f64(
  const float64_t * pSrcA,
  const float64_t * pSrcB,
  uint32_t blockSize,
  float64_t * result)
{
    float64_t acc = 0.;   /* running sum */
    uint32_t idx;         /* sample index */

    for (idx = 0U; idx < blockSize; idx++)
    {
        /* acc += A[idx] * B[idx] */
        acc += pSrcA[idx] * pSrcB[idx];
    }

    /* Hand the accumulated sum back to the caller */
    *result = acc;
}
/**
@} end of BasicDotProd group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q15.c
* Description: Q15 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
*/
/**
* @brief Dot product of Q15 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
* results are added to a 64-bit accumulator in 34.30 format.
* Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
* there is no risk of overflow.
* The return result is in 34.30 format.
*/
/*
 * Q15 dot product accumulated into a 64-bit (34.30) sum.
 * With ARM_MATH_DSP the bulk of the data is processed four samples per
 * iteration using the dual 16-bit multiply-accumulate __SMLALD; leftover
 * samples (and all samples without ARM_MATH_DSP) use a plain widening
 * multiply.
 */
void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
  q63_t sum = 0;                                 /* Temporary result storage */
  uint32_t blkCnt;                               /* loop counter */

#if defined (ARM_MATH_DSP)
  /* Loop unrolling: each iteration consumes four samples from each input */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Each __SMLALD reads one packed pair of q15 samples from A and B
       and adds both products to the 64-bit accumulator. */
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, the remaining 1 to 3 samples
  ** are handled one at a time below. */
  blkCnt = blockSize % 0x4U;
#else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* A single sample must not be fed to __SMLALD: the sign-extended
       operand carries 0xFFFF (-1) in its upper halfword when negative,
       so two negative inputs would add an extra (-1 * -1) = +1 to the
       sum. A plain 16x16 -> 32-bit multiply is exact. */
    sum += (q63_t) ((q31_t) *pSrcA++ * *pSrcB++);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;
}
/**
* @} end of dot_prod group
*/

View File

@@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q31.c
* Description: Q31 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
*/
/**
* @brief Dot product of Q31 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these
* are truncated to 2.48 format by discarding the lower 14 bits.
* The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
* There are 15 guard bits in the accumulator and there is no risk of overflow as long as
* the length of the vectors is less than 2^16 elements.
* The return result is in 16.48 format.
*/
/* Q31 dot product: every 64-bit product is shifted right by 14 bits before it is accumulated. */
void arm_dot_prod_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
{
    q63_t acc = 0;        /* 64-bit running sum */
    uint32_t remaining;   /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)
    uint32_t quads;       /* number of complete groups of four samples */

    /* Unrolled part: four products per iteration, accumulated in source order */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14U;
        acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14U;
        acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14U;
        acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14U;

        pSrcA += 4;
        pSrcB += 4;
    }

    /* The 0..3 samples that did not fill a group */
    remaining = blockSize % 0x4U;
#else
    /* Build without ARM_MATH_DSP: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

    for (; remaining > 0U; remaining--)
    {
        /* acc += (A[k] * B[k]) >> 14 */
        acc += ((q63_t) *pSrcA++ * *pSrcB++) >> 14U;
    }

    /* Store the result in the destination buffer in 16.48 format */
    *result = acc;
}
/**
* @} end of dot_prod group
*/

View File

@@ -0,0 +1,147 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_dot_prod_q7.c
* Description: Q7 dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup dot_prod
* @{
*/
/**
* @brief Dot product of Q7 vectors.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[in] blockSize number of samples in each vector
* @param[out] *result output result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these
* results are added to an accumulator in 18.14 format.
* Nonsaturating additions are used and there is no danger of wrap around as long as
* the vectors are less than 2^18 elements long.
* The return result is in 18.14 format.
*/
/*
 * Q7 dot product accumulated into a 32-bit (18.14) sum.
 * With ARM_MATH_DSP the bulk of the data is processed four samples per
 * iteration: each 32-bit word of four q7 values is widened to two pairs
 * of 16-bit values and fed to the dual multiply-accumulate __SMLAD.
 * Leftover samples (and all samples without ARM_MATH_DSP) use a plain
 * widening multiply.
 */
void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
{
  uint32_t blkCnt;                               /* loop counter */
  q31_t sum = 0;                                 /* Temporary variables to store output */

#if defined (ARM_MATH_DSP)
  q31_t input1, input2;                          /* Packed words holding four q7 samples each */
  q31_t inA1, inA2, inB1, inB2;                  /* Sample pairs widened to 16 bits */

  /* Loop unrolling: each iteration consumes four samples from each input */
  blkCnt = blockSize >> 2U;

  while (blkCnt > 0U)
  {
    /* read 4 samples at a time from sourceA */
    input1 = *__SIMD32(pSrcA)++;
    /* read 4 samples at a time from sourceB */
    input2 = *__SIMD32(pSrcB)++;

    /* sign-extend bytes 1 and 3 (after rotating by 8) to two q15_t values */
    inA1 = __SXTB16(__ROR(input1, 8));
    /* sign-extend the remaining two samples (bytes 0 and 2) */
    inA2 = __SXTB16(input1);
    /* same extraction for sourceB, so the pairs line up with sourceA */
    inB1 = __SXTB16(__ROR(input2, 8));
    inB2 = __SXTB16(input2);

    /* multiply and accumulate two samples at a time */
    sum = __SMLAD(inA1, inB1, sum);
    sum = __SMLAD(inA2, inB2, sum);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, the remaining 1 to 3 samples
  ** are handled one at a time below. */
  blkCnt = blockSize % 0x4U;
#else
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

  while (blkCnt > 0U)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* A single sample must not be fed to __SMLAD: the sign-extended
       operand carries 0xFFFF (-1) in its upper halfword when negative,
       so two negative inputs would add an extra (-1 * -1) = +1 to the
       sum. A plain 8x8 -> 16-bit multiply is exact. */
    sum += (q31_t) ((q15_t) *pSrcA++ * *pSrcB++);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
}
/**
* @} end of dot_prod group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f16.c
* Description: Floating-point vector multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicMult Vector Multiplication
Element-by-element multiplication of two vectors.
<pre>
pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * Helium (MVE) half-precision implementation: multiplies the inputs
 * lane by lane, one f16x8 vector (8 samples) at a time, with a
 * predicated store for the final partial vector.
 */
void arm_mult_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t blockSize)
{
uint32_t blkCnt; /* Loop counter */
f16x8_t vec1;
f16x8_t vec2;
f16x8_t res;
/* Compute 8 outputs at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
/* C = A * B */
/* Multiply and then store the results in the destination buffer. */
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
res = vmulq(vec1, vec2);
vst1q(pDst, res);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
/* Remaining 0..7 samples */
blkCnt = blockSize & 0x7;
if (blkCnt > 0U)
{
/* C = A * B */
/* Only the first blkCnt lanes are written by the predicated store.
   NOTE(review): the two loads below are unpredicated, so up to 7 elements
   past the end of each input are read -- confirm callers tolerate this. */
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vld1q(pSrcA);
vec2 = vld1q(pSrcB);
vstrhq_p(pDst, vmulq(vec1,vec2), p0);
}
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
/* Scalar half-precision element-wise product: pDst[n] = pSrcA[n] * pSrcB[n]. */
void arm_mult_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
  float16_t * pDst,
  uint32_t blockSize)
{
    uint32_t remaining;   /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t quads;       /* number of complete groups of four samples */

    /* Unrolled part: four products per iteration; each result is stored
       before the next pair of inputs is read */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        pDst[0] = (_Float16)(pSrcA[0]) * (_Float16)(pSrcB[0]);
        pDst[1] = (_Float16)(pSrcA[1]) * (_Float16)(pSrcB[1]);
        pDst[2] = (_Float16)(pSrcA[2]) * (_Float16)(pSrcB[2]);
        pDst[3] = (_Float16)(pSrcA[3]) * (_Float16)(pSrcB[3]);

        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }

    /* The 0..3 samples that did not fill a group */
    remaining = blockSize % 0x4U;
#else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining > 0U; remaining--)
    {
        /* C = A * B */
        *pDst++ = (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
    }
}
#endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,162 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f32.c
* Description: Floating-point vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup BasicMult Vector Multiplication
*
* Element-by-element multiplication of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] * pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicMult
* @{
*/
/**
* @brief Floating-point vector multiplication.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*/
/* Single-precision element-wise product: pDst[n] = pSrcA[n] * pSrcB[n]. */
void arm_mult_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t blockSize)
{
    uint32_t remaining;             /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)
    uint32_t quads;                 /* number of complete groups of four samples */
    float32_t a0, a1, a2, a3;       /* samples read from pSrcA */
    float32_t b0, b1, b2, b3;       /* samples read from pSrcB */

    /* Unrolled part: four products per iteration.
       Reads and writes are issued in a fixed order: seven of the eight
       inputs are read, the first result is written, the last B sample is
       read, and then the other three results are written. */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        a0 = pSrcA[0];
        b0 = pSrcB[0];
        a1 = pSrcA[1];
        b1 = pSrcB[1];
        a2 = pSrcA[2];
        b2 = pSrcB[2];
        a3 = pSrcA[3];

        pDst[0] = a0 * b0;

        b3 = pSrcB[3];

        pDst[1] = a1 * b1;
        pDst[2] = a2 * b2;
        pDst[3] = a3 * b3;

        /* Move on to the next group of four samples */
        pSrcA += 4U;
        pSrcB += 4U;
        pDst += 4U;
    }

    /* The 0..3 samples that did not fill a group */
    remaining = blockSize % 0x4U;
#else
    /* Build without ARM_MATH_DSP: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

    for (; remaining > 0U; remaining--)
    {
        /* C = A * B */
        *pDst++ = (*pSrcA++) * (*pSrcB++);
    }
}
/**
* @} end of BasicMult group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_f64.c
* Description: Floating-point vector multiplication
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicMult
@{
*/
/**
@brief Floating-point vector multiplication.
@param[in] pSrcA points to the first input vector.
@param[in] pSrcB points to the second input vector.
@param[out] pDst points to the output vector.
@param[in] blockSize number of samples in each vector.
@return none
*/
/* Double-precision element-wise product: pDst[n] = pSrcA[n] * pSrcB[n]. */
void arm_mult_f64(
  const float64_t * pSrcA,
  const float64_t * pSrcB,
  float64_t * pDst,
  uint32_t blockSize)
{
    uint32_t idx;   /* sample index */

    for (idx = 0U; idx < blockSize; idx++)
    {
        /* C = A * B */
        pDst[idx] = pSrcA[idx] * pSrcB[idx];
    }
}
/**
@} end of BasicMult group
*/

View File

@@ -0,0 +1,142 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q15.c
* Description: Q15 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
*/
/**
 * @brief         Element-wise multiplication of two Q15 vectors.
 * @param[in]     *pSrcA     points to the first input vector
 * @param[in]     *pSrcB     points to the second input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in each vector
 * @return        none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Each product is shifted right by 15 bits and saturated to 16 bits,
 * so results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
void arm_mult_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined: four samples per iteration */
  q31_t aPair0, aPair1, bPair0, bPair1;          /* two packed Q15 inputs per word */
  q31_t prodHi0, prodLo0, prodHi1, prodLo1;      /* unscaled products */
  q15_t resHi0, resLo0, resHi1, resLo1;          /* scaled and saturated results */
  uint32_t groups;                               /* 4-sample groups still to process */

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* Fetch four samples from each source as two 32-bit words */
    aPair0 = *__SIMD32(pSrcA)++;
    bPair0 = *__SIMD32(pSrcB)++;
    aPair1 = *__SIMD32(pSrcA)++;
    bPair1 = *__SIMD32(pSrcB)++;

    /* Multiply matching half-words: upper with upper, lower with lower */
    prodHi0 = (q31_t) ((q15_t) (aPair0 >> 16) * (q15_t) (bPair0 >> 16));
    prodLo0 = (q31_t) ((q15_t) aPair0 * (q15_t) bPair0);
    prodHi1 = (q31_t) ((q15_t) (aPair1 >> 16) * (q15_t) (bPair1 >> 16));
    prodLo1 = (q31_t) ((q15_t) aPair1 * (q15_t) bPair1);

    /* Bring the products back to Q15 and saturate to 16 bits */
    resHi0 = (q15_t) __SSAT(prodHi0 >> 15, 16);
    resLo0 = (q15_t) __SSAT(prodLo0 >> 15, 16);
    resHi1 = (q15_t) __SSAT(prodHi1 >> 15, 16);
    resLo1 = (q15_t) __SSAT(prodLo1 >> 15, 16);

    /* Each result returns to the half-word its operands were taken from,
    ** so a single pack order is used whether or not ARM_MATH_BIG_ENDIAN
    ** is defined. */
    *__SIMD32(pDst)++ = __PKHBT(resLo0, resHi0, 16);
    *__SIMD32(pDst)++ = __PKHBT(resLo1, resHi1, 16);
  }

  /* Up to three samples are left over when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = A * B, rescaled to Q15 and saturated */
    q31_t product = (q31_t) (*pSrcA++) * (*pSrcB++);

    *pDst++ = (q15_t) __SSAT(product >> 15, 16);
  }
}
/**
* @} end of BasicMult group
*/

View File

@@ -0,0 +1,148 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q31.c
* Description: Q31 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
*/
/**
 * @brief         Element-wise multiplication of two Q31 vectors.
 * @param[in]     *pSrcA     points to the first input vector
 * @param[in]     *pSrcB     points to the second input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in each vector
 * @return        none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
 */
void arm_mult_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t count;                                /* iterations left in the current loop */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined.
  ** Each result is the upper 32 bits of the 64-bit product, saturated to
  ** 31 bits and then shifted left by one, so bit 0 of every output is zero. */
  q31_t a0, a1, a2, a3;                          /* samples from pSrcA */
  q31_t b0, b1, b2, b3;                          /* samples from pSrcB */
  q31_t r0, r1, r2, r3;                          /* intermediate results */

  /* Four samples per iteration */
  for (count = blockSize >> 2U; count > 0U; count--)
  {
    a0 = *pSrcA++;
    a1 = *pSrcA++;
    a2 = *pSrcA++;
    a3 = *pSrcA++;

    b0 = *pSrcB++;
    b1 = *pSrcB++;
    b2 = *pSrcB++;
    b3 = *pSrcB++;

    r0 = ((q63_t) a0 * b0) >> 32;
    r0 = __SSAT(r0, 31);

    r1 = ((q63_t) a1 * b1) >> 32;
    r1 = __SSAT(r1, 31);

    r2 = ((q63_t) a2 * b2) >> 32;
    r2 = __SSAT(r2, 31);

    r3 = ((q63_t) a3 * b3) >> 32;
    r3 = __SSAT(r3, 31);

    *pDst++ = r0 << 1U;
    *pDst++ = r1 << 1U;
    *pDst++ = r2 << 1U;
    *pDst++ = r3 << 1U;
  }

  /* Remaining 0 to 3 samples, one per iteration */
  for (count = blockSize % 0x4U; count > 0U; count--)
  {
    a0 = *pSrcA++;
    b0 = *pSrcB++;

    r0 = ((q63_t) a0 * b0) >> 32;
    r0 = __SSAT(r0, 31);

    *pDst++ = r0 << 1U;
  }

#else

  /* Path compiled without ARM_MATH_DSP: 64-bit product shifted right by 31
  ** and clipped to the Q31 range. */
  for (count = blockSize; count > 0U; count--)
  {
    q63_t wide = (q63_t) (*pSrcA++) * (*pSrcB++);

    *pDst++ = (q31_t) clip_q63_to_q31(wide >> 31);
  }

#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicMult group
*/

View File

@@ -0,0 +1,115 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mult_q7.c
* Description: Q7 vector multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicMult
* @{
*/
/**
 * @brief         Element-wise multiplication of two Q7 vectors.
 * @param[in]     *pSrcA     points to the first input vector
 * @param[in]     *pSrcB     points to the second input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in each vector
 * @return        none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Each product is shifted right by 7 bits and saturated to 8 bits,
 * so results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
void arm_mult_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined: four samples per iteration */
  q7_t res0, res1, res2, res3;                   /* saturated products of one group */
  uint32_t groups;                               /* 4-sample groups still to process */

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* C = A * B for four consecutive samples */
    res0 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    res1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    res2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    res3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);

    /* Pack the four results and write them with one 32-bit store */
    *__SIMD32(pDst)++ = __PACKq7(res0, res1, res2, res3);
  }

  /* Up to three samples are left over when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = A * B, rescaled to Q7 and saturated */
    q31_t product = (q15_t) (*pSrcA++) * (*pSrcB++);

    *pDst++ = (q7_t) __SSAT((product >> 7), 8);
  }
}
/**
* @} end of BasicMult group
*/

View File

@@ -0,0 +1,166 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f16.c
* Description: Negates floating-point vectors
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicNegate Vector Negate
Negates the elements of a vector.
<pre>
pDst[n] = -pSrc[n], 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicNegate
@{
*/
/**
  @brief         Negates the elements of a half-precision floating-point vector.
  @param[in]     pSrc       points to input vector.
  @param[out]    pDst       points to output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"

void arm_negate_f16(
  const float16_t * pSrc,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t blkCnt;                               /* Loop counter */
    f16x8_t vec1;
    f16x8_t res;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        /* C = -A */

        /* Negate and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vnegq(vec1);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 samples remain */
    blkCnt = blockSize & 0x7;
    if (blkCnt > 0U)
    {
        /* C = -A */
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Use a predicated load so that only the remaining samples are read;
           an unpredicated 8-lane load here would read past the end of pSrc. */
        vec1 = vld1q_z((float16_t const *) pSrc, p0);
        vstrhq_p(pDst, vnegq(vec1), p0);
    }
}

#else
#if defined(ARM_FLOAT16_SUPPORTED)

void arm_negate_f16(
  const float16_t * pSrc,
        float16_t * pDst,
        uint32_t blockSize)
{
    uint32_t blkCnt;                               /* Loop counter */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        /* C = -A */

        /* Negate and store result in destination buffer. */
        *pDst++ = -(_Float16)*pSrc++;
        *pDst++ = -(_Float16)*pSrc++;
        *pDst++ = -(_Float16)*pSrc++;
        *pDst++ = -(_Float16)*pSrc++;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = blockSize % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

    while (blkCnt > 0U)
    {
        /* C = -A */

        /* Negate and store result in destination buffer. */
        *pDst++ = -(_Float16)*pSrc++;

        /* Decrement loop counter */
        blkCnt--;
    }
}

#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f32.c
* Description: Negates floating-point vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup negate Vector Negate
*
* Negates the elements of a vector.
*
* <pre>
* pDst[n] = -pSrc[n], 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup negate
* @{
*/
/**
 * @brief         Writes the negation of every element of a floating-point vector.
 * @param[in]     *pSrc      points to the input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in the vector
 * @return        none.
 */
void arm_negate_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined: four samples per iteration */
  float32_t v0, v1, v2, v3;                      /* inputs of the current group */
  uint32_t groups;                               /* 4-sample groups still to process */

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* Load the whole group before writing anything */
    v0 = pSrc[0];
    v1 = pSrc[1];
    v2 = pSrc[2];
    v3 = pSrc[3];

    /* C = -A */
    pDst[0] = -v0;
    pDst[1] = -v1;
    pDst[2] = -v2;
    pDst[3] = -v3;

    /* Step both vectors to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* Up to three samples are left over when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = -A */
    *pDst++ = -*pSrc++;
  }
}
/**
* @} end of negate group
*/

View File

@@ -0,0 +1,73 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_f64.c
* Description: Negates floating-point vectors
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicNegate
@{
*/
/**
  @brief         Writes the negation of every element of a float64_t vector.
  @param[in]     pSrc       points to input vector.
  @param[out]    pDst       points to output vector.
  @param[in]     blockSize  number of samples in each vector.
  @return        none
 */
void arm_negate_f64(
  const float64_t * pSrc,
        float64_t * pDst,
        uint32_t blockSize)
{
  uint32_t idx;                                  /* Index of the sample being processed */

  /* pDst[n] = -pSrc[n] for every n in [0, blockSize) */
  for (idx = 0U; idx < blockSize; idx++)
  {
    pDst[idx] = -pSrc[idx];
  }
}
/**
@} end of BasicNegate group
*/

View File

@@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q15.c
* Description: Negates Q15 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup negate
* @{
*/
/**
 * @brief         Writes the saturated negation of every element of a Q15 vector.
 * @param[in]     *pSrc      points to the input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in the vector
 * @return        none.
 *
 * \par Conditions for optimum performance
 *  Input and output buffers should be aligned by 32-bit
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */
  q15_t sample;                                  /* input currently being negated */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined: four samples per iteration */
  q31_t pair0, pair1;                            /* two packed Q15 samples each */
  uint32_t groups;                               /* 4-sample groups still to process */

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* Load four samples as two 32-bit words before writing anything */
    pair0 = _SIMD32_OFFSET(pSrc);
    pair1 = _SIMD32_OFFSET(pSrc + 2);

    /* C = -A: saturating 0 - x on both half-words of each word */
    pair0 = __QSUB16(0, pair0);
    pair1 = __QSUB16(0, pair1);

    /* Store four results as two 32-bit words */
    _SIMD32_OFFSET(pDst) = pair0;
    _SIMD32_OFFSET(pDst + 2) = pair1;

    /* Step both vectors to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* Up to three samples are left over when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    sample = *pSrc++;

    /* C = -A; the most negative input has no positive counterpart and
    ** is mapped to 0x7fff instead */
    if (sample == (q15_t) 0x8000)
    {
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
/**
* @} end of negate group
*/

View File

@@ -0,0 +1,117 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q31.c
* Description: Negates Q31 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup negate
* @{
*/
/**
 * @brief         Writes the saturated negation of every element of a Q31 vector.
 * @param[in]     *pSrc      points to the input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in the vector
 * @return        none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
void arm_negate_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */
  q31_t sample;                                  /* input currently being negated */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined: four samples per iteration */
  q31_t v0, v1, v2, v3;                          /* inputs of the current group */
  uint32_t groups;                               /* 4-sample groups still to process */

  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* Load the whole group before writing anything */
    v0 = *pSrc++;
    v1 = *pSrc++;
    v2 = *pSrc++;
    v3 = *pSrc++;

    /* C = -A, computed as a saturating 0 - x */
    *pDst++ = __QSUB(0, v0);
    *pDst++ = __QSUB(0, v1);
    *pDst++ = __QSUB(0, v2);
    *pDst++ = __QSUB(0, v3);
  }

  /* Up to three samples are left over when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    sample = *pSrc++;

    /* C = -A; INT32_MIN has no positive counterpart and is mapped to
    ** INT32_MAX instead */
    if (sample == INT32_MIN)
    {
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
}
/**
* @} end of negate group
*/

View File

@@ -0,0 +1,113 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_negate_q7.c
* Description: Negates Q7 vectors
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup negate
* @{
*/
/**
 * @brief         Negates the elements of a Q7 vector.
 * @param[in]     *pSrc      points to the input vector
 * @param[out]    *pDst      points to the output vector
 * @param[in]     blockSize  number of samples in the vector
 * @return        none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */
void arm_negate_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  q7_t in;                                       /* input sample of the one-at-a-time loop */

#if defined (ARM_MATH_DSP)

  /* Path compiled when ARM_MATH_DSP is defined */
  q31_t input;                                   /* Four packed Q7 input values */
  q31_t zero = 0x00000000;                       /* Minuend for the saturating 0 - x */

  /* loop unrolling */
  blkCnt = blockSize >> 2U;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while (blkCnt > 0U)
  {
    /* C = -A */
    /* Read four inputs with one 32-bit load */
    input = *__SIMD32(pSrc)++;

    /* Negate all four bytes with saturation and store them with one 32-bit write */
    *__SIMD32(pDst)++ = __QSUB8(zero, input);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4U;

#else

  /* Path compiled without ARM_MATH_DSP */

  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  while (blkCnt > 0U)
  {
    /* C = -A */
    /* Negate and then store the results in the destination buffer.
     ** (No line-continuation backslash after this comment: one would splice
     ** the following statement onto the comment's line.) */
    in = *pSrc++;

    /* 0x80 has no positive counterpart in Q7 and is mapped to 0x7f instead */
    *pDst++ = (in == (q7_t) 0x80) ? 0x7f : -in;

    /* Decrement the loop counter */
    blkCnt--;
  }
}
/**
* @} end of negate group
*/

View File

@@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u16.c
* Description: uint16_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Not Vector bitwise NOT
Compute the logical bitwise NOT.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Not
@{
*/
/**
  @brief         Compute the bitwise NOT of every element of a uint16_t vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
void arm_not_u16(
    const uint16_t * pSrc,
          uint16_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecSrc;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;

    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);

        vst1q(pDst, vmvnq_u16(vecSrc) );

        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 samples remain */
    blkCnt = blockSize & 7;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Use a predicated load so that only the remaining samples are read;
           an unpredicated 8-lane load here would read past the end of pSrc. */
        vecSrc = vld1q_z(pSrc, p0);

        vstrhq_p(pDst, vmvnq_u16(vecSrc), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t inV;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        inV = vld1q_u16(pSrc);

        vst1q_u16(pDst, vmvnq_u16(inV) );

        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the loop below */
    blkCnt = blockSize & 7;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        /* The complement is computed on the promoted value and narrowed
           back to uint16_t by the store. */
        *pDst++ = ~(*pSrc++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u32.c
* Description: uint32_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
  @brief         Compute the bitwise NOT of every element of a uint32_t vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none
 */
void arm_not_u32(
    const uint32_t * pSrc,
          uint32_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecSrc;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;

    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);

        vst1q(pDst, vmvnq_u32(vecSrc) );

        pSrc += 4;
        pDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 3 samples remain */
    blkCnt = blockSize & 3;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        /* Use a predicated load so that only the remaining samples are read;
           an unpredicated 4-lane load here would read past the end of pSrc. */
        vecSrc = vld1q_z(pSrc, p0);

        vstrwq_p(pDst, vmvnq_u32(vecSrc), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t inV;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        inV = vld1q_u32(pSrc);

        vst1q_u32(pDst, vmvnq_u32(inV) );

        pSrc += 4;
        pDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the loop below */
    blkCnt = blockSize & 3;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = ~(*pSrc++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_not_u8.c
* Description: uint8_t bitwise NOT
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Not
@{
*/
/**
  @brief         Compute the logical bitwise NOT of a fixed-point vector.
  @param[in]     pSrc       points to input vector
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = ~pSrc[n] for 0 <= n < blockSize.
                   A blockSize of 0 writes nothing. No element at index
                   blockSize or beyond is read or written.
 */
void arm_not_u8(
    const uint8_t * pSrc,
          uint8_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecSrc;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;

    while (blkCnt > 0U)
    {
        vecSrc = vld1q(pSrc);

        vst1q(pDst, vmvnq_u8(vecSrc) );

        pSrc += 16;
        pDst += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 15 leftover bytes */
    blkCnt = blockSize & 0xF;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);

        /* Load under the same predicate as the store so that bytes past the
           end of the source buffer are never touched; inactive lanes read
           as zero and are discarded by the predicated store. */
        vecSrc = vld1q_z(pSrc, p0);
        vstrbq_p(pDst, vmvnq_u8(vecSrc), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t inV;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;

    while (blkCnt > 0U)
    {
        inV = vld1q_u8(pSrc);

        vst1q_u8(pDst, vmvnq_u8(inV) );

        pSrc += 16;
        pDst += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one byte at a time by the scalar loop below */
    blkCnt = blockSize & 0xF;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = ~(*pSrc++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Not group
*/

View File

@@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f16.c
* Description: Floating-point vector offset
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicOffset Vector Offset
Adds a constant offset to each element of a vector.
<pre>
pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
</pre>
The functions support in-place computation allowing the source and
destination pointers to reference the same memory buffer.
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicOffset
@{
*/
/**
@brief Adds a constant offset to a floating-point vector.
@param[in] pSrc points to the input vector
@param[in] offset is the offset to be added
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_offset_f16(
  const float16_t * pSrc,
        float16_t offset,
        float16_t * pDst,
        uint32_t blockSize)
{
    /* Helium (MVE) half-precision variant: pDst[n] = pSrc[n] + offset
       for 0 <= n < blockSize. */
    uint32_t blkCnt;                               /* Loop counter */

    f16x8_t vec1;
    f16x8_t res;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        /* C = A + offset */

        /* Add offset and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vaddq(vec1,offset);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 leftover samples */
    blkCnt = blockSize & 0x7;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Load under the same predicate as the store so that samples past
           the end of the source buffer are never read; inactive lanes are
           zeroed and then dropped by the predicated store. */
        vec1 = vld1q_z((float16_t const *) pSrc, p0);
        vstrhq_p(pDst, vaddq(vec1, offset), p0);
    }
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_offset_f16(
  const float16_t * pSrc,
        float16_t offset,
        float16_t * pDst,
        uint32_t blockSize)
{
    /* Scalar half-precision variant: pDst[n] = pSrc[n] + offset. */

    /* Samples left for the one-at-a-time loop at the bottom. */
    uint32_t remaining = blockSize;

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t quads;

    /* Unrolled section: each pass produces four results, in index order. */
    for (quads = blockSize >> 2U; quads != 0U; quads--)
    {
        pDst[0] = (_Float16)pSrc[0] + (_Float16)offset;
        pDst[1] = (_Float16)pSrc[1] + (_Float16)offset;
        pDst[2] = (_Float16)pSrc[2] + (_Float16)offset;
        pDst[3] = (_Float16)pSrc[3] + (_Float16)offset;

        pSrc += 4;
        pDst += 4;
    }

    /* Whatever did not fit into a group of four (0 to 3 samples). */
    remaining = blockSize & 0x3U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining != 0U; remaining--)
    {
        /* C = A + offset */
        *pDst++ = (_Float16)(*pSrc++) + (_Float16)offset;
    }
}
#endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,154 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f32.c
* Description: Floating-point vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup offset Vector Offset
*
* Adds a constant offset to each element of a vector.
*
* <pre>
* pDst[n] = pSrc[n] + offset, 0 <= n < blockSize.
* </pre>
*
* The functions support in-place computation allowing the source and
* destination pointers to reference the same memory buffer.
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup offset
* @{
*/
/**
 * @brief  Adds a constant offset to a floating-point vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * Computes pDst[n] = pSrc[n] + offset for 0 <= n < blockSize.
 * Every input sample is read before the output at the same index is
 * written, so pSrc and pDst may reference the same buffer.
 */
void arm_offset_f32(
  float32_t * pSrc,
  float32_t offset,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the single-sample loop */

#if defined (ARM_MATH_DSP)

  float32_t s0, s1, s2, s3;                      /* one group of four inputs */
  uint32_t quads;                                /* number of four-sample groups */

  /* Unrolled section: four samples per pass. All four inputs are fetched
   ** before the first result is stored. */
  for (quads = blockSize >> 2U; quads != 0U; quads--)
  {
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];

    /* C = A + offset */
    pDst[0] = s0 + offset;
    pDst[1] = s1 + offset;
    pDst[2] = s2 + offset;
    pDst[3] = s3 + offset;

    /* advance to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* 0 to 3 samples remain when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = blockSize;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (*pSrc++) + offset;
  }
}
/**
* @} end of offset group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_f64.c
* Description: Floating-point vector offset
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicOffset
@{
*/
/**
  @brief         Adds a constant offset to a floating-point vector.
  @param[in]     pSrc       points to the input vector
  @param[in]     offset     is the offset to be added
  @param[out]    pDst       points to the output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrc[n] + offset for 0 <= n < blockSize,
                   evaluated one sample at a time in increasing index order.
 */
void arm_offset_f64(
  const float64_t * pSrc,
        float64_t offset,
        float64_t * pDst,
        uint32_t blockSize)
{
  uint32_t idx;                                  /* Sample index */

  for (idx = 0U; idx < blockSize; idx++)
  {
    /* C = A + offset */
    pDst[idx] = pSrc[idx] + offset;
  }
}
/**
@} end of BasicOffset group
*/

View File

@@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q15.c
* Description: Q15 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup offset
* @{
*/
/**
 * @brief  Adds a constant offset to a Q15 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Each sum is formed with saturating arithmetic.
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
 */
void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples handled one at a time */

#if defined (ARM_MATH_DSP)

  q31_t packedOffset;                            /* offset duplicated in both halfwords */
  uint32_t quads = blockSize >> 2U;              /* number of four-sample groups */

  /* Replicate the offset into both 16-bit lanes so one SIMD add covers two samples */
  packedOffset = __PKHBT(offset, offset, 16);

  /* Four samples per pass, as two 32-bit (two-sample) saturating adds */
  for (; quads != 0U; quads--)
  {
    /* C = A + offset */
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, packedOffset);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, packedOffset);
  }

  /* 0 to 3 samples remain when blockSize is not a multiple of 4 */
  for (remaining = blockSize & 0x3U; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
  }

#else

  /* No SIMD: widen, add, then saturate back to 16 bits, one sample at a time */
  for (remaining = blockSize; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
  }

#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q31.c
* Description: Q31 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup offset
* @{
*/
/**
 * @brief  Adds a constant offset to a Q31 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Each sum is formed with saturating arithmetic.
 * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
 */
void arm_offset_q31(
  q31_t * pSrc,
  q31_t offset,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples handled one at a time */

#if defined (ARM_MATH_DSP)

  q31_t s0, s1, s2, s3;                          /* one group of four inputs */
  uint32_t quads;                                /* number of four-sample groups */

  /* Unrolled section: fetch four inputs, then store four saturated sums */
  for (quads = blockSize >> 2U; quads != 0U; quads--)
  {
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];
    pSrc += 4;

    /* C = A + offset */
    pDst[0] = __QADD(s0, offset);
    pDst[1] = __QADD(s1, offset);
    pDst[2] = __QADD(s2, offset);
    pDst[3] = __QADD(s3, offset);
    pDst += 4;
  }

  /* 0 to 3 samples remain when blockSize is not a multiple of 4 */
  for (remaining = blockSize & 0x3U; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = __QADD(*pSrc++, offset);
  }

#else

  /* No saturating-add intrinsic: add in 64 bits and clip back to Q31 */
  for (remaining = blockSize; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
  }

#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
*/

View File

@@ -0,0 +1,123 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_offset_q7.c
* Description: Q7 vector offset
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup offset
* @{
*/
/**
 * @brief  Adds a constant offset to a Q7 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Each sum is formed with saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] are saturated.
 */
void arm_offset_q7(
  q7_t * pSrc,
  q7_t offset,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples handled one at a time */

#if defined (ARM_MATH_DSP)

  q31_t packedOffset;                            /* offset duplicated in all four bytes */
  uint32_t quads = blockSize >> 2U;              /* number of four-sample groups */

  /* Replicate the offset into every byte lane so one SIMD add covers four samples */
  packedOffset = __PACKq7(offset, offset, offset, offset);

  /* Four samples per pass through a single 32-bit saturating byte-wise add */
  for (; quads != 0U; quads--)
  {
    /* C = A + offset */
    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, packedOffset);
  }

  /* 0 to 3 samples remain when blockSize is not a multiple of 4 */
  for (remaining = blockSize & 0x3U; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
  }

#else

  /* No SIMD: widen, add, then saturate back to 8 bits, one sample at a time */
  for (remaining = blockSize; remaining != 0U; remaining--)
  {
    /* C = A + offset */
    *pDst++ = (q7_t) __SSAT((q15_t) * pSrc++ + offset, 8);
  }

#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of offset group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u16.c
* Description: uint16_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Or Vector bitwise inclusive OR
Compute the logical bitwise OR.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Or
@{
*/
/**
  @brief         Compute the logical bitwise OR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] | pSrcB[n] for 0 <= n < blockSize.
                   A blockSize of 0 writes nothing. No element at index
                   blockSize or beyond is read or written.
 */
void arm_or_u16(
    const uint16_t * pSrcA,
    const uint16_t * pSrcB,
          uint16_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecSrcA, vecSrcB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vorrq_u16(vecSrcA, vecSrcB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 leftover samples */
    blkCnt = blockSize & 7;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Load both inputs under the store predicate so that samples past
           the end of either source buffer are never read. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrhq_p(pDst, vorrq_u16(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecA, vecB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u16(pSrcA);
        vecB = vld1q_u16(pSrcB);

        vst1q_u16(pDst, vorrq_u16(vecA, vecB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst  += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the scalar loop below */
    blkCnt = blockSize & 7;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)|(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u32.c
* Description: uint32_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Or
@{
*/
/**
  @brief         Compute the logical bitwise OR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] | pSrcB[n] for 0 <= n < blockSize.
                   A blockSize of 0 writes nothing. No element at index
                   blockSize or beyond is read or written.
 */
void arm_or_u32(
    const uint32_t * pSrcA,
    const uint32_t * pSrcB,
          uint32_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecSrcA, vecSrcB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vorrq_u32(vecSrcA, vecSrcB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 3 leftover samples */
    blkCnt = blockSize & 3;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        /* Load both inputs under the store predicate so that samples past
           the end of either source buffer are never read. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrwq_p(pDst, vorrq_u32(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecA, vecB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u32(pSrcA);
        vecB = vld1q_u32(pSrcB);

        vst1q_u32(pDst, vorrq_u32(vecA, vecB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst  += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one sample at a time by the scalar loop below */
    blkCnt = blockSize & 3;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)|(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_or_u8.c
* Description: uint8_t bitwise inclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Or
@{
*/
/**
  @brief         Compute the logical bitwise OR of two fixed-point vectors.
  @param[in]     pSrcA      points to input vector A
  @param[in]     pSrcB      points to input vector B
  @param[out]    pDst       points to output vector
  @param[in]     blockSize  number of samples in each vector
  @return        none

  @par           Details
                   pDst[n] = pSrcA[n] | pSrcB[n] for 0 <= n < blockSize.
                   A blockSize of 0 writes nothing. No element at index
                   blockSize or beyond is read or written.
 */
void arm_or_u8(
    const uint8_t * pSrcA,
    const uint8_t * pSrcB,
          uint8_t * pDst,
          uint32_t blockSize)
{
    uint32_t blkCnt;      /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecSrcA, vecSrcB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;

    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);

        vst1q(pDst, vorrq_u8(vecSrcA, vecSrcB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 15 leftover bytes */
    blkCnt = blockSize & 0xF;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp8q(blkCnt);

        /* Load both inputs under the store predicate so that bytes past
           the end of either source buffer are never read. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);
        vstrbq_p(pDst, vorrq_u8(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecA, vecB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;

    while (blkCnt > 0U)
    {
        vecA = vld1q_u8(pSrcA);
        vecB = vld1q_u8(pSrcB);

        vst1q_u8(pDst, vorrq_u8(vecA, vecB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst  += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: handled one byte at a time by the scalar loop below */
    blkCnt = blockSize & 0xF;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)|(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Or group
*/

View File

@@ -0,0 +1,183 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f16.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicScale Vector Scale
Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
<pre>
pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
</pre>
In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The algorithm used with fixed-point data is:
<pre>
pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
</pre>
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
The functions support in-place computation allowing the source and destination
pointers to reference the same memory buffer.
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_scale_f16(
  const float16_t * pSrc,
        float16_t scale,
        float16_t * pDst,
        uint32_t blockSize)
{
    /* Helium (MVE) half-precision variant: pDst[n] = pSrc[n] * scale
       for 0 <= n < blockSize. */
    uint32_t blkCnt;                               /* Loop counter */

    f16x8_t vec1;
    f16x8_t res;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;

    while (blkCnt > 0U)
    {
        /* C = A * scale */

        /* Scale the input and then store the results in the destination buffer. */
        vec1 = vld1q(pSrc);
        res = vmulq(vec1,scale);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrc += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1 to 7 leftover samples */
    blkCnt = blockSize & 0x7;

    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Load under the same predicate as the store so that samples past
           the end of the source buffer are never read; inactive lanes are
           zeroed and then dropped by the predicated store. */
        vec1 = vld1q_z((float16_t const *) pSrc, p0);
        vstrhq_p(pDst, vmulq(vec1, scale), p0);
    }
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
void arm_scale_f16(
  const float16_t *pSrc,
        float16_t scale,
        float16_t *pDst,
        uint32_t blockSize)
{
    /* Scalar half-precision variant: pDst[n] = pSrc[n] * scale. */

    /* Samples left for the one-at-a-time loop at the bottom. */
    uint32_t remaining = blockSize;

#if defined (ARM_MATH_LOOPUNROLL)
    uint32_t quads;

    /* Unrolled section: each pass produces four results, in index order. */
    for (quads = blockSize >> 2U; quads != 0U; quads--)
    {
        pDst[0] = (_Float16)pSrc[0] * (_Float16)scale;
        pDst[1] = (_Float16)pSrc[1] * (_Float16)scale;
        pDst[2] = (_Float16)pSrc[2] * (_Float16)scale;
        pDst[3] = (_Float16)pSrc[3] * (_Float16)scale;

        pSrc += 4;
        pDst += 4;
    }

    /* Whatever did not fit into a group of four (0 to 3 samples). */
    remaining = blockSize & 0x3U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining != 0U; remaining--)
    {
        /* C = A * scale */
        *pDst++ = (_Float16)(*pSrc++) * (_Float16)scale;
    }
}
#endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,157 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f32.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup scale Vector Scale
*
* Multiply a vector by a scalar value. For floating-point data, the algorithm used is:
*
* <pre>
* pDst[n] = pSrc[n] * scale, 0 <= n < blockSize.
* </pre>
*
* In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by
* a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
* The shift allows the gain of the scaling operation to exceed 1.0.
* The algorithm used with fixed-point data is:
*
* <pre>
* pDst[n] = (pSrc[n] * scaleFract) << shift, 0 <= n < blockSize.
* </pre>
*
* The overall scale factor applied to the fixed-point data is
* <pre>
* scale = scaleFract * 2^shift.
* </pre>
*
* The functions support in-place computation allowing the source and destination
* pointers to reference the same memory buffer.
*/
/**
* @addtogroup scale
* @{
*/
/**
* @brief Multiplies a floating-point vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scale scale factor to be applied
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*/
void arm_scale_f32(
  float32_t * pSrc,
  float32_t scale,
  float32_t * pDst,
  uint32_t blockSize)
{
  uint32_t remaining;                            /* samples left for the one-at-a-time loop */

#if defined (ARM_MATH_DSP)
  float32_t s0, s1, s2, s3;                      /* scaled samples of the current group */
  uint32_t groups;                               /* number of 4-sample groups still to process */

  /* Unrolled part: four samples per iteration. */
  for (groups = blockSize >> 2U; groups > 0U; groups--)
  {
    /* C = A * scale; all four inputs are read before any output is written */
    s0 = pSrc[0] * scale;
    s1 = pSrc[1] * scale;
    s2 = pSrc[2] * scale;
    s3 = pSrc[3] * scale;

    pDst[0] = s0;
    pDst[1] = s1;
    pDst[2] = s2;
    pDst[3] = s3;

    /* advance to the next group */
    pSrc += 4U;
    pDst += 4U;
  }

  /* Whatever is left when blockSize is not a multiple of 4. */
  remaining = blockSize % 0x4U;
#else
  /* No unrolling: every sample goes through the loop below. */
  remaining = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C = A * scale */
    *pDst++ = (*pSrc++) * scale;
  }
}
/**
* @} end of scale group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_f64.c
* Description: Multiplies a floating-point vector by a scalar
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicScale
@{
*/
/**
@brief Multiplies a floating-point vector by a scalar.
@param[in] pSrc points to the input vector
@param[in] scale scale factor to be applied
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
void arm_scale_f64(
  const float64_t *pSrc,
        float64_t scale,
        float64_t *pDst,
        uint32_t blockSize)
{
  uint32_t idx;                                  /* Index of the sample being processed */

  /* C = A * scale, one sample per iteration in ascending order. */
  for (idx = 0U; idx < blockSize; idx++)
  {
    pDst[idx] = pSrc[idx] * scale;
  }
}
/**
@} end of BasicScale group
*/

View File

@@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q15.c
* Description: Multiplies a Q15 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup scale
* @{
*/
/**
* @brief Multiplies a Q15 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
* These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
void arm_scale_q15(
  q15_t * pSrc,
  q15_t scaleFract,
  int8_t shift,
  q15_t * pDst,
  uint32_t blockSize)
{
  int8_t kShift = 15 - shift;                    /* right shift applied to each product */
  uint32_t blkCnt;                               /* loop counter */

#if defined (ARM_MATH_DSP)
  q31_t pairA, pairB;                            /* two packed 16-bit inputs each */
  q31_t prodA_hi, prodA_lo, prodB_hi, prodB_lo;  /* shifted products */
  q15_t satA_hi, satA_lo, satB_hi, satB_lo;      /* saturated 16-bit results */

  /* Unrolled part: four samples (two 32-bit words) per iteration. */
  for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
  {
    /* Each 32-bit read fetches two samples and advances pSrc past them. */
    pairA = *__SIMD32(pSrc)++;
    pairB = *__SIMD32(pSrc)++;

    /* C = A * scale: multiply each half-word by scaleFract, then shift. */
    prodA_hi = ((q31_t) ((q15_t) (pairA >> 16) * scaleFract)) >> kShift;
    prodA_lo = ((q31_t) ((q15_t) pairA * scaleFract)) >> kShift;
    prodB_hi = ((q31_t) ((q15_t) (pairB >> 16) * scaleFract)) >> kShift;
    prodB_lo = ((q31_t) ((q15_t) pairB * scaleFract)) >> kShift;

    /* Saturate to 16 bits. */
    satA_hi = (q15_t) (__SSAT(prodA_hi, 16));
    satA_lo = (q15_t) (__SSAT(prodA_lo, 16));
    satB_hi = (q15_t) (__SSAT(prodB_hi, 16));
    satB_lo = (q15_t) (__SSAT(prodB_lo, 16));

    /* Re-pack two results per word and write them with one store each. */
    *__SIMD32(pDst)++ = __PKHBT(satA_lo, satA_hi, 16);
    *__SIMD32(pDst)++ = __PKHBT(satB_lo, satB_hi, 16);
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
  {
    /* C = A * scale */
    *pDst++ = (q15_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 16));
  }
#else
  /* No unrolling: one sample per iteration. */
  for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
  {
    /* C = A * scale */
    *pDst++ = (q15_t) (__SSAT(((q31_t) * pSrc++ * scaleFract) >> kShift, 16));
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of scale group
*/

View File

@@ -0,0 +1,227 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q31.c
* Description: Multiplies a Q31 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup scale
* @{
*/
/**
* @brief Multiplies a Q31 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
* These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
*/
void arm_scale_q31(
  q31_t * pSrc,
  q31_t scaleFract,
  int8_t shift,
  q31_t * pDst,
  uint32_t blockSize)
{
  int8_t kShift = shift + 1;                     /* shift applied after the multiply */
  int8_t sign = (kShift & 0x80);                 /* non-zero when kShift is negative */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in, out;                                 /* single-sample temporaries */

#if defined (ARM_MATH_DSP)
  q31_t in1, in2, in3, in4;                      /* high words of the four products */
  q31_t out1, out2, out3, out4;                  /* shifted / saturated results */

  /* Unrolled part: four samples per iteration. */
  blkCnt = blockSize >> 2U;

  if (sign != 0U)
  {
    /* kShift is negative: shift right by -kShift, no saturation involved. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* keep the upper 32 bits of each 64-bit product */
      in1 = ((q63_t) pSrc[0] * scaleFract) >> 32;
      in2 = ((q63_t) pSrc[1] * scaleFract) >> 32;
      in3 = ((q63_t) pSrc[2] * scaleFract) >> 32;
      in4 = ((q63_t) pSrc[3] * scaleFract) >> 32;

      out1 = in1 >> -kShift;
      out2 = in2 >> -kShift;
      out3 = in3 >> -kShift;
      out4 = in4 >> -kShift;

      pDst[0] = out1;
      pDst[1] = out2;
      pDst[2] = out3;
      pDst[3] = out4;

      pSrc += 4U;
      pDst += 4U;
    }
  }
  else
  {
    /* kShift is non-negative: shift left and saturate on overflow. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* keep the upper 32 bits of each 64-bit product */
      in1 = ((q63_t) pSrc[0] * scaleFract) >> 32;
      in2 = ((q63_t) pSrc[1] * scaleFract) >> 32;
      in3 = ((q63_t) pSrc[2] * scaleFract) >> 32;
      in4 = ((q63_t) pSrc[3] * scaleFract) >> 32;

      /* If shifting back does not recover the value, bits were lost:
       * replace the result with 0x7FFFFFFF or 0x80000000 by sign. */
      out1 = in1 << kShift;
      if (in1 != (out1 >> kShift))
      {
        out1 = 0x7FFFFFFF ^ (in1 >> 31);
      }

      out2 = in2 << kShift;
      if (in2 != (out2 >> kShift))
      {
        out2 = 0x7FFFFFFF ^ (in2 >> 31);
      }

      out3 = in3 << kShift;
      if (in3 != (out3 >> kShift))
      {
        out3 = 0x7FFFFFFF ^ (in3 >> 31);
      }

      out4 = in4 << kShift;
      if (in4 != (out4 >> kShift))
      {
        out4 = 0x7FFFFFFF ^ (in4 >> 31);
      }

      pDst[0] = out1;
      pDst[1] = out2;
      pDst[2] = out3;
      pDst[3] = out4;

      pSrc += 4U;
      pDst += 4U;
    }
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4U;
#else
  /* No unrolling: every sample goes through the loops below. */
  blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

  if (sign != 0)
  {
    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A * scale, then shift right by -kShift */
      in = *pSrc++;
      in = ((q63_t) in * scaleFract) >> 32;
      out = in >> -kShift;
      *pDst++ = out;
    }
  }
  else
  {
    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A * scale, then shift left by kShift with saturation */
      in = *pSrc++;
      in = ((q63_t) in * scaleFract) >> 32;
      out = in << kShift;
      if (in != (out >> kShift))
      {
        out = 0x7FFFFFFF ^ (in >> 31);
      }
      *pDst++ = out;
    }
  }
}
/**
* @} end of scale group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_scale_q7.c
* Description: Multiplies a Q7 vector by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup scale
* @{
*/
/**
* @brief Multiplies a Q7 vector by a scalar.
* @param[in] *pSrc points to the input vector
* @param[in] scaleFract fractional portion of the scale value
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
* These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
*/
void arm_scale_q7(
  q7_t * pSrc,
  q7_t scaleFract,
  int8_t shift,
  q7_t * pDst,
  uint32_t blockSize)
{
  int8_t kShift = 7 - shift;                     /* right shift applied to each product */
  uint32_t blkCnt;                               /* loop counter */

#if defined (ARM_MATH_DSP)
  q7_t a0, a1, a2, a3;                           /* four input samples */
  q7_t r0, r1, r2, r3;                           /* four saturated results */

  /* Unrolled part: four samples per iteration. */
  for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
  {
    /* Read four inputs. */
    a0 = *pSrc++;
    a1 = *pSrc++;
    a2 = *pSrc++;
    a3 = *pSrc++;

    /* C = A * scale: multiply, shift right, saturate to 8 bits. */
    r0 = (q7_t) (__SSAT(((a0) * scaleFract) >> kShift, 8));
    r1 = (q7_t) (__SSAT(((a1) * scaleFract) >> kShift, 8));
    r2 = (q7_t) (__SSAT(((a2) * scaleFract) >> kShift, 8));
    r3 = (q7_t) (__SSAT(((a3) * scaleFract) >> kShift, 8));

    /* Pack the four bytes into one word and write it with a single store. */
    *__SIMD32(pDst)++ = __PACKq7(r0, r1, r2, r3);
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
  {
    /* C = A * scale */
    *pDst++ = (q7_t) (__SSAT(((*pSrc++) * scaleFract) >> kShift, 8));
  }
#else
  /* No unrolling: one sample per iteration. */
  for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
  {
    /* C = A * scale */
    *pDst++ = (q7_t) (__SSAT((((q15_t) * pSrc++ * scaleFract) >> kShift), 8));
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of scale group
*/

View File

@@ -0,0 +1,236 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q15.c
* Description: Shifts the elements of a Q15 vector by a specified number of bits
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup shift
* @{
*/
/**
* @brief Shifts the elements of a Q15 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
*/
void arm_shift_q15(
  q15_t * pSrc,
  int8_t shiftBits,
  q15_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  uint8_t sign = (shiftBits & 0x80);             /* non-zero when shiftBits is negative */

#if defined (ARM_MATH_DSP)
  q15_t in1, in2;                                /* pair of input samples */

  /* Unrolled part: four samples (two packed stores) per iteration. */
  blkCnt = blockSize >> 2U;

  if (sign != 0U)
  {
    /* Negative shiftBits: shift right by -shiftBits. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* First pair: C = A >> -shiftBits, two results packed into one word. */
      in1 = *pSrc++;
      in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
                                  (in2 >> -shiftBits), 16);
#else
      *__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
                                  (in1 >> -shiftBits), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Second pair. */
      in1 = *pSrc++;
      in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT((in1 >> -shiftBits),
                                  (in2 >> -shiftBits), 16);
#else
      *__SIMD32(pDst)++ = __PKHBT((in2 >> -shiftBits),
                                  (in1 >> -shiftBits), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    }

    /* Remaining samples when blockSize is not a multiple of 4. */
    for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> -shiftBits */
      *pDst++ = (*pSrc++ >> -shiftBits);
    }
  }
  else
  {
    /* Non-negative shiftBits: shift left and saturate to 16 bits. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* First pair: C = A << shiftBits, two results packed into one word. */
      in1 = *pSrc++;
      in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
                                  __SSAT((in2 << shiftBits), 16), 16);
#else
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
                                  __SSAT((in1 << shiftBits), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      /* Second pair. */
      in1 = *pSrc++;
      in2 = *pSrc++;
#ifndef ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << shiftBits), 16),
                                  __SSAT((in2 << shiftBits), 16), 16);
#else
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << shiftBits), 16),
                                  __SSAT((in1 << shiftBits), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    }

    /* Remaining samples when blockSize is not a multiple of 4. */
    for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits */
      *pDst++ = __SSAT((*pSrc++ << shiftBits), 16);
    }
  }
#else
  /* No unrolling: one sample per iteration. */
  if (sign != 0U)
  {
    /* Negative shiftBits: shift right by -shiftBits. */
    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> -shiftBits */
      *pDst++ = (*pSrc++ >> -shiftBits);
    }
  }
  else
  {
    /* Non-negative shiftBits: shift left and saturate to 16 bits. */
    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits */
      *pDst++ = __SSAT(((q31_t) * pSrc++ << shiftBits), 16);
    }
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of shift group
*/

View File

@@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q31.c
* Description: Shifts the elements of a Q31 vector by a specified number of bits
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup shift Vector Shift
*
* Shifts the elements of a fixed-point vector by a specified number of bits.
* There are separate functions for Q7, Q15, and Q31 data types.
* The underlying algorithm used is:
*
* <pre>
* pDst[n] = pSrc[n] << shift, 0 <= n < blockSize.
* </pre>
*
* If <code>shift</code> is positive then the elements of the vector are shifted to the left.
* If <code>shift</code> is negative then the elements of the vector are shifted to the right.
*
* The functions support in-place computation allowing the source and destination
* pointers to reference the same memory buffer.
*/
/**
* @addtogroup shift
* @{
*/
/**
* @brief Shifts the elements of a Q31 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
*/
void arm_shift_q31(
  q31_t * pSrc,
  int8_t shiftBits,
  q31_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  uint8_t sign = (shiftBits & 0x80);             /* non-zero when shiftBits is negative */

#if defined (ARM_MATH_DSP)
  q31_t in1, in2, in3, in4;                      /* four input samples */
  q31_t out1, out2, out3, out4;                  /* four shifted results */

  /* Unrolled part: four samples per iteration. */
  blkCnt = blockSize >> 2U;

  if (sign != 0U)
  {
    /* Negative shiftBits: C = A >> -shiftBits. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* read all four inputs before writing any output */
      in1 = pSrc[0];
      in2 = pSrc[1];
      in3 = pSrc[2];
      in4 = pSrc[3];

      pDst[0] = (in1 >> -shiftBits);
      pDst[1] = (in2 >> -shiftBits);
      pDst[2] = (in3 >> -shiftBits);
      pDst[3] = (in4 >> -shiftBits);

      pSrc += 4U;
      pDst += 4U;
    }
  }
  else
  {
    /* Non-negative shiftBits: C = A << shiftBits with saturation. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* read all four inputs before writing any output */
      in1 = pSrc[0];
      in2 = pSrc[1];
      in3 = pSrc[2];
      in4 = pSrc[3];

      /* If shifting back does not recover the input, bits were lost:
       * replace the result with 0x7FFFFFFF or 0x80000000 by sign. */
      out1 = in1 << shiftBits;
      if (in1 != (out1 >> shiftBits))
      {
        out1 = 0x7FFFFFFF ^ (in1 >> 31);
      }

      out2 = in2 << shiftBits;
      if (in2 != (out2 >> shiftBits))
      {
        out2 = 0x7FFFFFFF ^ (in2 >> 31);
      }

      out3 = in3 << shiftBits;
      if (in3 != (out3 >> shiftBits))
      {
        out3 = 0x7FFFFFFF ^ (in3 >> 31);
      }

      out4 = in4 << shiftBits;
      if (in4 != (out4 >> shiftBits))
      {
        out4 = 0x7FFFFFFF ^ (in4 >> 31);
      }

      pDst[0] = out1;
      pDst[1] = out2;
      pDst[2] = out3;
      pDst[3] = out4;

      pSrc += 4U;
      pDst += 4U;
    }
  }

  /* Remaining samples when blockSize is not a multiple of 4. */
  blkCnt = blockSize % 0x4U;
#else
  /* No unrolling: every sample goes through the loops below. */
  blkCnt = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

  if (sign != 0U)
  {
    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> -shiftBits */
      *pDst++ = (*pSrc++ >> -shiftBits);
    }
  }
  else
  {
    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits, computed in 64 bits and clipped to Q31 */
      *pDst++ = clip_q63_to_q31((q63_t) * pSrc++ << shiftBits);
    }
  }
}
/**
* @} end of shift group
*/

View File

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_shift_q7.c
* Description: Processing function for the Q7 Shifting
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup shift
* @{
*/
/**
* @brief Shifts the elements of a Q7 vector a specified number of bits.
* @param[in] *pSrc points to the input vector
* @param[in] shiftBits number of bits to shift. A positive value shifts left; a negative value shifts right.
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in the vector
* @return none.
*
* \par Conditions for optimum performance
* Input and output buffers should be aligned by 32-bit
*
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
void arm_shift_q7(
  q7_t * pSrc,
  int8_t shiftBits,
  q7_t * pDst,
  uint32_t blockSize)
{
  uint32_t blkCnt;                               /* loop counter */
  uint8_t sign = (shiftBits & 0x80);             /* non-zero when shiftBits is negative */

#if defined (ARM_MATH_DSP)
  q7_t in1, in2, in3, in4;                       /* four input samples */

  /* Unrolled part: four samples (one packed store) per iteration. */
  blkCnt = blockSize >> 2U;

  if (sign != 0U)
  {
    /* Negative shiftBits: negate once and shift right by that amount. */
    shiftBits = -shiftBits;

    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> shiftBits */
      in1 = pSrc[0];
      in2 = pSrc[1];
      in3 = pSrc[2];
      in4 = pSrc[3];

      /* Pack the four shifted bytes into one word and store it. */
      *__SIMD32(pDst)++ = __PACKq7((in1 >> shiftBits), (in2 >> shiftBits),
                                   (in3 >> shiftBits), (in4 >> shiftBits));

      pSrc += 4U;
    }

    /* Remaining samples when blockSize is not a multiple of 4. */
    for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> shiftBits */
      in1 = *pSrc++;
      *pDst++ = (in1 >> shiftBits);
    }
  }
  else
  {
    /* Non-negative shiftBits: shift left and saturate to 8 bits. */
    for (; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits */
      in1 = pSrc[0];
      in2 = pSrc[1];
      in3 = pSrc[2];
      in4 = pSrc[3];

      /* Pack the four saturated bytes into one word and store it. */
      *__SIMD32(pDst)++ = __PACKq7(__SSAT((in1 << shiftBits), 8),
                                   __SSAT((in2 << shiftBits), 8),
                                   __SSAT((in3 << shiftBits), 8),
                                   __SSAT((in4 << shiftBits), 8));

      pSrc += 4U;
    }

    /* Remaining samples when blockSize is not a multiple of 4. */
    for (blkCnt = blockSize % 0x4U; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits */
      *pDst++ = (q7_t) __SSAT((*pSrc++ << shiftBits), 8);
    }
  }
#else
  /* No unrolling: one sample per iteration. */
  if (sign != 0U)
  {
    /* Negative shiftBits: shift right by -shiftBits. */
    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
      /* C = A >> -shiftBits */
      *pDst++ = (*pSrc++ >> -shiftBits);
    }
  }
  else
  {
    /* Non-negative shiftBits: shift left and saturate to 8 bits. */
    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
      /* C = A << shiftBits */
      *pDst++ = (q7_t) __SSAT(((q15_t) * pSrc++ << shiftBits), 8);
    }
  }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of shift group
*/

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f16.c
* Description: Floating-point vector subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions_f16.h"
/**
@ingroup groupMath
*/
/**
@defgroup BasicSub Vector Subtraction
Element-by-element subtraction of two vectors.
<pre>
pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
</pre>
There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * Helium (MVE) half-precision build of the vector subtraction:
 *   pDst[n] = pSrcA[n] - pSrcB[n],  0 <= n < blockSize.
 *
 * Full vectors of 8 half-floats are processed first; the remaining
 * 1..7 samples are handled by one tail-predicated step.
 */
void arm_sub_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    float16_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;    /* Loop counter */
    f16x8_t vec1;
    f16x8_t vec2;
    f16x8_t res;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        /* C = A - B */
        /* Subtract and then store the results in the destination buffer. */
        vec1 = vld1q(pSrcA);
        vec2 = vld1q(pSrcB);
        res = vsubq(vec1, vec2);
        vst1q(pDst, res);

        /* Increment pointers */
        pSrcA += 8;
        pSrcB += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: blockSize modulo 8 samples are left */
    blkCnt = blockSize & 0x7;
    if (blkCnt > 0U)
    {
        /* C = A - B */
        /* p0 enables only the first blkCnt 16-bit lanes. */
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Predicated (zeroing) loads: disabled lanes are not fetched, so
         * nothing is read beyond the end of the input vectors. */
        vec1 = vld1q_z(pSrcA, p0);
        vec2 = vld1q_z(pSrcB, p0);

        /* The predicated store writes the enabled lanes only. */
        vstrhq_p(pDst, vsubq(vec1, vec2), p0);
    }
}
#else
#if defined(ARM_FLOAT16_SUPPORTED)
/*
 * Scalar half-precision build of the vector subtraction:
 *   pDst[n] = pSrcA[n] - pSrcB[n],  0 <= n < blockSize.
 *
 * With ARM_MATH_LOOPUNROLL (and without ARM_MATH_AUTOVECTORIZE) the bulk of
 * the work is done four samples per iteration; the trailing loop finishes
 * whatever is left.
 */
void arm_sub_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    float16_t * pDst,
    uint32_t blockSize)
{
    /* Samples still to be handled by the one-at-a-time loop */
    uint32_t remaining = blockSize;

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t quads;

    /* Unrolled part: each pass produces four outputs, element by element
     * (every store happens before the next element is read). */
    for (quads = blockSize >> 2U; quads > 0U; quads--)
    {
        pDst[0] = (_Float16)pSrcA[0] - (_Float16)pSrcB[0];
        pDst[1] = (_Float16)pSrcA[1] - (_Float16)pSrcB[1];
        pDst[2] = (_Float16)pSrcA[2] - (_Float16)pSrcB[2];
        pDst[3] = (_Float16)pSrcA[3] - (_Float16)pSrcB[3];

        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;
    }

    /* Only the 0..3 leftover samples remain */
    remaining = blockSize & 0x3U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    /* C = A - B, one sample per iteration */
    for (; remaining > 0U; remaining--)
    {
        *pDst = (_Float16)(*pSrcA) - (_Float16)(*pSrcB);
        pSrcA++;
        pSrcB++;
        pDst++;
    }
}
#endif
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,138 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f32.c
* Description: Floating-point vector subtraction.
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @defgroup BasicSub Vector Subtraction
*
* Element-by-element subtraction of two vectors.
*
* <pre>
* pDst[n] = pSrcA[n] - pSrcB[n], 0 <= n < blockSize.
* </pre>
*
* There are separate functions for floating-point, Q7, Q15, and Q31 data types.
*/
/**
* @addtogroup BasicSub
* @{
*/
/**
* @brief Floating-point vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*/
/*
 * Single-precision vector subtraction:
 *   pDst[n] = pSrcA[n] - pSrcB[n],  0 <= n < blockSize.
 *
 * When ARM_MATH_DSP is defined the first (blockSize / 4) * 4 samples are
 * handled four at a time; the final loop processes the rest (or everything
 * when ARM_MATH_DSP is not defined).
 */
void arm_sub_f32(
    float32_t * pSrcA,
    float32_t * pSrcB,
    float32_t * pDst,
    uint32_t blockSize)
{
    uint32_t remaining;                      /* samples left for the single-sample loop */

#if defined (ARM_MATH_DSP)
    uint32_t groups;                         /* number of 4-sample groups left */
    float32_t diff0, diff1, diff2, diff3;    /* results of the current group */

    for (groups = blockSize >> 2U; groups > 0U; groups--)
    {
        /* All eight inputs of the group are consumed before any output is
         * stored, exactly as in a load-all / store-all sequence. */
        diff0 = pSrcA[0] - pSrcB[0];
        diff1 = pSrcA[1] - pSrcB[1];
        diff2 = pSrcA[2] - pSrcB[2];
        diff3 = pSrcA[3] - pSrcB[3];

        pDst[0] = diff0;
        pDst[1] = diff1;
        pDst[2] = diff2;
        pDst[3] = diff3;

        /* Step to the next group of four samples */
        pSrcA += 4U;
        pSrcB += 4U;
        pDst += 4U;
    }

    /* 0..3 samples are left when blockSize is not a multiple of 4 */
    remaining = blockSize & 0x3U;
#else
    /* No unrolling: every sample goes through the loop below */
    remaining = blockSize;
#endif /* #if defined (ARM_MATH_DSP) */

    /* C = A - B, one sample per iteration */
    for (; remaining > 0U; remaining--)
    {
        *pDst = (*pSrcA) - (*pSrcB);
        pSrcA++;
        pSrcB++;
        pDst++;
    }
}
/**
* @} end of BasicSub group
*/

View File

@@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_f64.c
* Description: Floating-point vector subtraction
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup BasicSub
@{
*/
/**
@brief Floating-point vector subtraction.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[out] pDst points to the output vector
@param[in] blockSize number of samples in each vector
@return none
*/
/*
 * Double-precision vector subtraction:
 *   pDst[n] = pSrcA[n] - pSrcB[n],  0 <= n < blockSize.
 * Plain element-by-element loop; nothing is written when blockSize is 0.
 */
void arm_sub_f64(
    const float64_t * pSrcA,
    const float64_t * pSrcB,
    float64_t * pDst,
    uint32_t blockSize)
{
    uint32_t idx;    /* index of the sample being processed */

    for (idx = 0U; idx < blockSize; idx++)
    {
        /* C = A - B */
        pDst[idx] = pSrcA[idx] - pSrcB[idx];
    }
}
/**
@} end of BasicSub group
*/

View File

@@ -0,0 +1,128 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q15.c
* Description: Q15 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
*/
/**
* @brief Q15 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
*/
/*
 * Q15 vector subtraction with saturation:
 *   pDst[n] = sat16(pSrcA[n] - pSrcB[n]),  0 <= n < blockSize.
 *
 * ARM_MATH_DSP build: samples are fetched two per 32-bit word through
 * __SIMD32 and subtracted with __QSUB16, four samples per iteration, then
 * the 0..3 leftovers are done one by one.
 * Other builds: every sample is widened to q31_t, subtracted and clamped to
 * 16 bits with __SSAT.
 */
void arm_sub_q15(
    q15_t * pSrcA,
    q15_t * pSrcB,
    q15_t * pDst,
    uint32_t blockSize)
{
    uint32_t count;                  /* iterations left in the current loop */
    q15_t sampleA, sampleB;          /* operands of the single-sample loops */

#if defined (ARM_MATH_DSP)
    q31_t pairA0, pairA1;            /* two packed Q15 samples each, from pSrcA */
    q31_t pairB0, pairB1;            /* two packed Q15 samples each, from pSrcB */

    for (count = blockSize >> 2U; count > 0U; count--)
    {
        /* Each __SIMD32 access moves one 32-bit word and advances the
         * pointer by two Q15 samples. All four words are read before the
         * first result is stored. */
        pairA0 = *__SIMD32(pSrcA)++;
        pairA1 = *__SIMD32(pSrcA)++;
        pairB0 = *__SIMD32(pSrcB)++;
        pairB1 = *__SIMD32(pSrcB)++;

        *__SIMD32(pDst)++ = __QSUB16(pairA0, pairB0);
        *__SIMD32(pDst)++ = __QSUB16(pairA1, pairB1);
    }

    /* Leftover samples when blockSize is not a multiple of 4 */
    for (count = blockSize & 0x3U; count > 0U; count--)
    {
        sampleA = *pSrcA++;
        sampleB = *pSrcB++;

        /* Only the low halfword of the __QSUB16 result is kept */
        *pDst++ = (q15_t) __QSUB16(sampleA, sampleB);
    }
#else
    for (count = blockSize; count > 0U; count--)
    {
        sampleA = *pSrcA++;
        sampleB = *pSrcB++;

        /* Subtract in 32 bits, then saturate to the 16-bit range */
        *pDst++ = (q15_t) __SSAT(((q31_t) sampleA - sampleB), 16);
    }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
*/

View File

@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q31.c
* Description: Q31 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
*/
/**
* @brief Q31 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
*/
/*
 * Q31 vector subtraction with saturation:
 *   pDst[n] = sat32(pSrcA[n] - pSrcB[n]),  0 <= n < blockSize.
 *
 * ARM_MATH_DSP build: __QSUB is applied to four samples per iteration, then
 * to the 0..3 leftovers.
 * Other builds: the difference is formed in q63_t and reduced with
 * clip_q63_to_q31().
 */
void arm_sub_q31(
    q31_t * pSrcA,
    q31_t * pSrcB,
    q31_t * pDst,
    uint32_t blockSize)
{
    uint32_t count;                  /* iterations left in the current loop */

#if defined (ARM_MATH_DSP)
    q31_t a0, a1, a2, a3;            /* current group from pSrcA */
    q31_t b0, b1, b2, b3;            /* current group from pSrcB */

    for (count = blockSize >> 2U; count > 0U; count--)
    {
        /* Fetch the whole group first ... */
        a0 = pSrcA[0];
        a1 = pSrcA[1];
        a2 = pSrcA[2];
        a3 = pSrcA[3];
        b0 = pSrcB[0];
        b1 = pSrcB[1];
        b2 = pSrcB[2];
        b3 = pSrcB[3];
        pSrcA += 4U;
        pSrcB += 4U;

        /* ... then store the four saturated differences */
        pDst[0] = __QSUB(a0, b0);
        pDst[1] = __QSUB(a1, b1);
        pDst[2] = __QSUB(a2, b2);
        pDst[3] = __QSUB(a3, b3);
        pDst += 4U;
    }

    /* Leftover samples when blockSize is not a multiple of 4 */
    for (count = blockSize & 0x3U; count > 0U; count--)
    {
        a0 = *pSrcA++;
        b0 = *pSrcB++;
        *pDst++ = __QSUB(a0, b0);
    }
#else
    q31_t minuend, subtrahend;       /* operands of the current sample */

    for (count = blockSize; count > 0U; count--)
    {
        minuend = *pSrcA++;
        subtrahend = *pSrcB++;

        /* 64-bit difference, clipped back to the Q31 range */
        *pDst++ = (q31_t) clip_q63_to_q31((q63_t) minuend - subtrahend);
    }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
*/

View File

@@ -0,0 +1,119 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_sub_q7.c
* Description: Q7 vector subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupMath
*/
/**
* @addtogroup BasicSub
* @{
*/
/**
* @brief Q7 vector subtraction.
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] blockSize number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
*/
/*
 * Q7 vector subtraction with saturation:
 *   pDst[n] = sat8(pSrcA[n] - pSrcB[n]),  0 <= n < blockSize.
 *
 * ARM_MATH_DSP build: four samples are moved per 32-bit word through
 * __SIMD32 and subtracted with __QSUB8; the 0..3 leftovers use __SSAT.
 * Other builds: every sample is widened, subtracted and clamped to 8 bits
 * with __SSAT.
 */
void arm_sub_q7(
    q7_t * pSrcA,
    q7_t * pSrcB,
    q7_t * pDst,
    uint32_t blockSize)
{
    uint32_t count;                  /* iterations left in the current loop */
    q7_t sampleA, sampleB;           /* operands of the single-sample loops */

#if defined (ARM_MATH_DSP)
    q31_t packedA, packedB;          /* four packed Q7 samples each */

    for (count = blockSize >> 2U; count > 0U; count--)
    {
        /* Each __SIMD32 access moves one 32-bit word and advances the
         * pointer by four Q7 samples. */
        packedA = *__SIMD32(pSrcA)++;
        packedB = *__SIMD32(pSrcB)++;
        *__SIMD32(pDst)++ = __QSUB8(packedA, packedB);
    }

    /* Leftover samples when blockSize is not a multiple of 4 */
    for (count = blockSize & 0x3U; count > 0U; count--)
    {
        sampleA = *pSrcA++;
        sampleB = *pSrcB++;
        *pDst++ = __SSAT((sampleA - sampleB), 8);
    }
#else
    for (count = blockSize; count > 0U; count--)
    {
        sampleA = *pSrcA++;
        sampleB = *pSrcB++;

        /* Subtract in a wider type, then saturate to the 8-bit range */
        *pDst++ = (q7_t) __SSAT(((q15_t) sampleA - sampleB), 8);
    }
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of BasicSub group
*/

View File

@@ -0,0 +1,137 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u16.c
* Description: uint16_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@defgroup Xor Vector bitwise exclusive OR
Compute the logical bitwise XOR.
There are separate functions for uint32_t, uint16_t, and uint8_t data types.
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
/*
 * Bitwise XOR of two uint16_t vectors:
 *   pDst[n] = pSrcA[n] ^ pSrcB[n],  0 <= n < blockSize.
 *
 * Three builds are selected by the preprocessor:
 *  - ARM_MATH_MVEI: 8 lanes per iteration plus one tail-predicated step;
 *  - ARM_MATH_NEON: 8 lanes per iteration, scalar loop for the remainder;
 *  - otherwise:     scalar loop over all samples.
 * The vector builds are disabled when ARM_MATH_AUTOVECTORIZE is defined.
 */
void arm_xor_u16(
    const uint16_t * pSrcA,
    const uint16_t * pSrcB,
    uint16_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;    /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecSrcA, vecSrcB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u16(vecSrcA, vecSrcB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: blockSize modulo 8 samples are left */
    blkCnt = blockSize & 7;
    if (blkCnt > 0U)
    {
        /* p0 enables only the first blkCnt 16-bit lanes. */
        mve_pred16_t p0 = vctp16q(blkCnt);

        /* Predicated (zeroing) loads: disabled lanes are not fetched, so
         * nothing is read beyond the end of the input vectors. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);

        /* The predicated store writes the enabled lanes only. */
        vstrhq_p(pDst, veorq_u16(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint16x8_t vecA, vecB;

    /* Compute 8 outputs at a time */
    blkCnt = blockSize >> 3U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u16(pSrcA);
        vecB = vld1q_u16(pSrcB);
        vst1q_u16(pDst, veorq_u16(vecA, vecB) );

        pSrcA += 8;
        pSrcB += 8;
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: the scalar loop below handles the remaining 0..7 samples */
    blkCnt = blockSize & 7;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u32.c
* Description: uint32_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
/*
 * Bitwise XOR of two uint32_t vectors:
 *   pDst[n] = pSrcA[n] ^ pSrcB[n],  0 <= n < blockSize.
 *
 * Three builds are selected by the preprocessor:
 *  - ARM_MATH_MVEI: 4 lanes per iteration plus one tail-predicated step;
 *  - ARM_MATH_NEON: 4 lanes per iteration, scalar loop for the remainder;
 *  - otherwise:     scalar loop over all samples.
 * The vector builds are disabled when ARM_MATH_AUTOVECTORIZE is defined.
 */
void arm_xor_u32(
    const uint32_t * pSrcA,
    const uint32_t * pSrcB,
    uint32_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;    /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecSrcA, vecSrcB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u32(vecSrcA, vecSrcB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: blockSize modulo 4 samples are left */
    blkCnt = blockSize & 3;
    if (blkCnt > 0U)
    {
        /* p0 enables only the first blkCnt 32-bit lanes. */
        mve_pred16_t p0 = vctp32q(blkCnt);

        /* Predicated (zeroing) loads: disabled lanes are not fetched, so
         * nothing is read beyond the end of the input vectors. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);

        /* The predicated store writes the enabled lanes only. */
        vstrwq_p(pDst, veorq_u32(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32x4_t vecA, vecB;

    /* Compute 4 outputs at a time */
    blkCnt = blockSize >> 2U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u32(pSrcA);
        vecB = vld1q_u32(pSrcB);
        vst1q_u32(pDst, veorq_u32(vecA, vecB) );

        pSrcA += 4;
        pSrcB += 4;
        pDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: the scalar loop below handles the remaining 0..3 samples */
    blkCnt = blockSize & 3;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_xor_u8.c
* Description: uint8_t bitwise exclusive OR
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/basic_math_functions.h"
/**
@ingroup groupMath
*/
/**
@addtogroup Xor
@{
*/
/**
@brief Compute the logical bitwise XOR of two fixed-point vectors.
@param[in] pSrcA points to input vector A
@param[in] pSrcB points to input vector B
@param[out] pDst points to output vector
@param[in] blockSize number of samples in each vector
@return none
*/
/*
 * Bitwise XOR of two uint8_t vectors:
 *   pDst[n] = pSrcA[n] ^ pSrcB[n],  0 <= n < blockSize.
 *
 * Three builds are selected by the preprocessor:
 *  - ARM_MATH_MVEI: 16 lanes per iteration plus one tail-predicated step;
 *  - ARM_MATH_NEON: 16 lanes per iteration, scalar loop for the remainder;
 *  - otherwise:     scalar loop over all samples.
 * The vector builds are disabled when ARM_MATH_AUTOVECTORIZE is defined.
 */
void arm_xor_u8(
    const uint8_t * pSrcA,
    const uint8_t * pSrcB,
    uint8_t * pDst,
    uint32_t blockSize)
{
    uint32_t blkCnt;    /* Loop counter */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecSrcA, vecSrcB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4;
    while (blkCnt > 0U)
    {
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        vst1q(pDst, veorq_u8(vecSrcA, vecSrcB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: blockSize modulo 16 samples are left */
    blkCnt = blockSize & 0xF;
    if (blkCnt > 0U)
    {
        /* p0 enables only the first blkCnt 8-bit lanes. */
        mve_pred16_t p0 = vctp8q(blkCnt);

        /* Predicated (zeroing) loads: disabled lanes are not fetched, so
         * nothing is read beyond the end of the input vectors. */
        vecSrcA = vld1q_z(pSrcA, p0);
        vecSrcB = vld1q_z(pSrcB, p0);

        /* The predicated store writes the enabled lanes only. */
        vstrbq_p(pDst, veorq_u8(vecSrcA, vecSrcB), p0);
    }
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint8x16_t vecA, vecB;

    /* Compute 16 outputs at a time */
    blkCnt = blockSize >> 4U;
    while (blkCnt > 0U)
    {
        vecA = vld1q_u8(pSrcA);
        vecB = vld1q_u8(pSrcB);
        vst1q_u8(pDst, veorq_u8(vecA, vecB) );

        pSrcA += 16;
        pSrcB += 16;
        pDst += 16;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: the scalar loop below handles the remaining 0..15 samples */
    blkCnt = blockSize & 0xF;
#else
    /* Initialize blkCnt with number of samples */
    blkCnt = blockSize;
#endif

    while (blkCnt > 0U)
    {
        *pDst++ = (*pSrcA++)^(*pSrcB++);

        /* Decrement the loop counter */
        blkCnt--;
    }
#endif /* if defined(ARM_MATH_MVEI) */
}
/**
@} end of Xor group
*/

View File

@@ -0,0 +1,29 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BayesFunctions.c
* Description: Combination of all bayes function source files.
*
* $Date: 16. March 2020
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_gaussian_naive_bayes_predict_f32.c"

View File

@@ -0,0 +1,27 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: BayesFunctions.c
* Description: Combination of all bayes function f16 source files.
*
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_gaussian_naive_bayes_predict_f16.c"

View File

@@ -0,0 +1,23 @@
# Builds the CMSISDSPBayes static library (Gaussian naive Bayes predictors).
# ROOT and DSP are not set in this file; they are expected to be provided by
# the including project.
cmake_minimum_required (VERSION 3.14)

project(CMSISDSPBayes)

include(configLib)
include(configDsp)

# Sources are listed explicitly. No directory glob is needed: every file of
# this library is named in a target_sources() call below.
add_library(CMSISDSPBayes STATIC)
target_sources(CMSISDSPBayes PRIVATE arm_gaussian_naive_bayes_predict_f32.c)

# Helpers from the two modules included above; presumably they apply the
# core/toolchain and DSP build settings to the target - confirm in
# configLib.cmake / configDsp.cmake.
configLib(CMSISDSPBayes ${ROOT})
configDsp(CMSISDSPBayes ${ROOT})

### Includes
target_include_directories(CMSISDSPBayes PUBLIC "${DSP}/Include")

# The half-precision predictor is only compiled when neither ARMAC5 nor
# DISABLEFLOAT16 is set.
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
  target_sources(CMSISDSPBayes PRIVATE arm_gaussian_naive_bayes_predict_f16.c)
endif()
endif()

View File

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_gaussian_naive_bayes_predict_f16.c
* Description: Naive Gaussian Bayesian Estimator
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/bayes_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
#include <limits.h>
#include <math.h>
#define PI_F 3.1415926535897932384626433832795f16
/**
* @addtogroup groupBayes
* @{
*/
/**
* @brief Naive Gaussian Bayesian Estimator
*
* @param[in] *S points to a naive bayes instance structure
* @param[in] *in points to the elements of the input vector.
* @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities
* @param[out] *pBufferB points to a temporary buffer of length numberOfClasses
* @return The predicted class
*
*
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math_f16.h"
/*
 * Helium (MVE) half-precision Gaussian naive Bayes predictor.
 *
 * For every class c the function stores in pOutputProbabilities[c]
 *
 *   -0.5 * sum_d log(2*PI*(sigma[c][d] + epsilon))
 *   -0.5 * sum_d (in[d] - theta[c][d])^2 / (sigma[c][d] + epsilon)
 *   + log(classPriors[c])
 *
 * and returns the index of the largest stored value (as reported by
 * arm_max_f16). theta and sigma are walked sequentially, vectorDimension
 * entries per class. pBufferB receives the logarithms of the class priors.
 *
 * NOTE(review): vrecip_medprec_f16 is assumed to be an approximate
 * per-lane reciprocal and vecAddAcrossF16Mve a sum across the lanes of a
 * vector - confirm against arm_vec_math_f16.h / arm_helium_utils.h.
 */
uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S,
    const float16_t * in,
    float16_t *pOutputProbabilities,
    float16_t *pBufferB
    )
{
    uint32_t nbClass;
    const float16_t *pTheta = S->theta;
    const float16_t *pSigma = S->sigma;
    float16_t *buffer = pOutputProbabilities;
    const float16_t *pIn = in;
    float16_t result;
    f16x8_t vsigma;
    _Float16 tmp;
    f16x8_t vacc1, vacc2;
    uint32_t index;
    float16_t *logclassPriors=pBufferB;
    float16_t *pLogPrior = logclassPriors;

    /* Logarithm of every class prior, written to the caller's scratch buffer */
    arm_vlog_f16((float16_t *) S->classPriors, logclassPriors, S->numberOfClasses);

    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
        /* Every class is evaluated against the same input vector */
        pIn = in;

        /* vacc1 accumulates the log terms, vacc2 the squared-distance terms */
        vacc1 = vdupq_n_f16(0.0f16);
        vacc2 = vdupq_n_f16(0.0f16);

        /* Full vectors: 8 dimensions per iteration */
        uint32_t blkCnt =S->vectorDimension >> 3;
        while (blkCnt > 0U) {
            f16x8_t vinvSigma, vtmp;

            vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));

            vinvSigma = vrecip_medprec_f16(vsigma);

            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);

            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);

            pIn += 8;
            pTheta += 8;
            pSigma += 8;
            blkCnt--;
        }

        /* Tail: vectorDimension modulo 8 dimensions are left */
        blkCnt = S->vectorDimension & 7;
        if (blkCnt > 0U) {
            /* p0 enables only the first blkCnt 16-bit lanes. */
            mve_pred16_t p0 = vctp16q(blkCnt);
            f16x8_t vinvSigma, vtmp;

            /* Predicated (zeroing) loads: disabled lanes are not fetched, so
             * nothing is read beyond the end of sigma, in or theta. Whatever
             * those lanes compute is discarded by the predicated
             * accumulations below, which leave disabled lanes unchanged. */
            vsigma = vaddq_n_f16(vld1q_z(pSigma, p0), S->epsilon);
            vacc1 =
                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);

            vinvSigma = vrecip_medprec_f16(vsigma);

            vtmp = vsubq(vld1q_z(pIn, p0), vld1q_z(pTheta, p0));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);

            vacc2 = vfmaq_m_f16(vacc2, vtmp, vinvSigma, p0);

            /* Move on to the parameters of the next class */
            pTheta += blkCnt;
            pSigma += blkCnt;
        }

        /* Combine both accumulators and add the log prior of this class */
        tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
        tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);

        *buffer = tmp + *pLogPrior++;
        buffer++;
    }

    /* The predicted class is the position of the largest stored value */
    arm_max_f16(pOutputProbabilities, S->numberOfClasses, &result, &index);

    return (index);
}
#else
/*
 * Portable scalar float16 implementation.
 *
 * Each class gets the score
 *   -0.5 * sum_d log(2 * PI * (sigma + epsilon))
 *   -0.5 * sum_d (in - theta)^2 / (sigma + epsilon)
 *   + log(prior)
 * written to pOutputProbabilities; the index of the largest score is
 * returned. pBufferB is not used by this variant.
 */
uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_instance_f16 *S,
   const float16_t * in,
   float16_t *pOutputProbabilities,
   float16_t *pBufferB)
{
    /* theta and sigma are consumed contiguously, one class after another. */
    const float16_t *prior    = S->classPriors;
    const float16_t *mean     = S->theta;
    const float16_t *variance = S->sigma;
    float16_t       *score    = pOutputProbabilities;
    float16_t        bestScore;
    uint32_t         bestClass;
    uint32_t         classIdx = 0U;

    (void)pBufferB;

    while (classIdx < S->numberOfClasses)
    {
        const float16_t *sample = in;   /* restart the input for each class */
        _Float16         logNorm = 0.0f16;
        _Float16         sqDist = 0.0f16;
        _Float16         logLikelihood;
        uint32_t         dimIdx = 0U;

        while (dimIdx < S->vectorDimension)
        {
            /* epsilon is added to every variance before it is used */
            _Float16 var = *variance + S->epsilon;

            logNorm += logf(2.0f16 * (_Float16)PI_F * var);
            sqDist += (*sample - *mean) * (*sample - *mean) / var;

            sample++;
            mean++;
            variance++;
            dimIdx++;
        }

        logLikelihood = -0.5f16 * logNorm;
        logLikelihood -= 0.5f16 * sqDist;

        *score = logLikelihood + logf(*prior++);
        score++;

        classIdx++;
    }

    arm_max_f16(pOutputProbabilities, S->numberOfClasses, &bestScore, &bestClass);

    return bestClass;
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupBayes group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,396 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_naive_gaussian_bayes_predict_f32
* Description: Naive Gaussian Bayesian Estimator
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/bayes_functions.h"
#include <limits.h>
#include <math.h>
#define PI_F 3.1415926535897932384626433832795f
#define DPI_F (2.0f*3.1415926535897932384626433832795f)
/**
* @addtogroup groupBayes
* @{
*/
/**
* @brief Naive Gaussian Bayesian Estimator
*
* @param[in] *S points to a naive bayes instance structure
* @param[in] *in points to the elements of the input vector.
* @param[out] *pOutputProbabilities points to a buffer of length numberOfClasses containing estimated probabilities
* @param[out] *pBufferB points to a temporary buffer of length numberOfClasses
* @return The predicted class
*
*
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#include "arm_vec_math.h"
/*
 * Helium (MVE) float32 implementation.
 *
 * For every class the function accumulates, over all dimensions d:
 *   vacc1 += log(2 * PI * (sigma[d] + epsilon))
 *   vacc2 += (in[d] - theta[d])^2 / (sigma[d] + epsilon)
 * and stores  -0.5 * sum(vacc1) - 0.5 * sum(vacc2) + log(prior)  in
 * pOutputProbabilities. The index of the largest stored value is returned.
 *
 * pBufferB receives the result of arm_vlog_f32() applied to S->classPriors
 * and must therefore hold S->numberOfClasses elements.
 *
 * theta and sigma are walked contiguously: class k starts
 * k * vectorDimension elements after class 0.
 */
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pOutputProbabilities,
   float32_t *pBufferB
   )
{
    uint32_t         nbClass;
    const float32_t *pTheta = S->theta;
    const float32_t *pSigma = S->sigma;
    float32_t       *buffer = pOutputProbabilities;
    const float32_t *pIn;
    float32_t        result;
    f32x4_t          vsigma;
    float32_t        tmp;
    f32x4_t          vacc1, vacc2;
    uint32_t         index;
    float32_t       *pLogPrior = pBufferB;

    /* Log of every class prior, computed once up front into the scratch buffer. */
    arm_vlog_f32((float32_t *) S->classPriors, pBufferB, S->numberOfClasses);

    for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
        uint32_t blkCnt;

        /* The same input vector is scored against every class. */
        pIn = in;

        vacc1 = vdupq_n_f32(0);
        vacc2 = vdupq_n_f32(0);

        /* Full vectors: 4 single-precision lanes per iteration. */
        blkCnt = S->vectorDimension >> 2;
        while (blkCnt > 0U) {
            f32x4_t vinvSigma, vtmp;

            vsigma = vaddq_n_f32(vld1q(pSigma), S->epsilon);
            vacc1 = vaddq(vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)));
            vinvSigma = vrecip_medprec_f32(vsigma);

            vtmp = vsubq(vld1q(pIn), vld1q(pTheta));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);

            vacc2 = vfmaq(vacc2, vtmp, vinvSigma);

            pIn += 4;
            pTheta += 4;
            pSigma += 4;
            blkCnt--;
        }

        /* Tail: 1..3 remaining lanes, selected by the predicate p0. */
        blkCnt = S->vectorDimension & 3;
        if (blkCnt > 0U) {
            mve_pred16_t p0 = vctp32q(blkCnt);
            f32x4_t      vinvSigma, vtmp;

            /*
             * Predicated (zeroing) loads: only the active lanes touch memory,
             * so nothing is read past the end of in / theta / sigma. The
             * inactive lanes never reach the accumulators because both
             * updates below are predicated with p0 as well.
             */
            vsigma = vaddq_n_f32(vld1q_z(pSigma, p0), S->epsilon);
            vacc1 =
                vaddq_m_f32(vacc1, vacc1, vlogq_f32(vmulq_n_f32(vsigma, 2.0f * PI)), p0);
            vinvSigma = vrecip_medprec_f32(vsigma);

            vtmp = vsubq(vld1q_z(pIn, p0), vld1q_z(pTheta, p0));
            /* squaring */
            vtmp = vmulq(vtmp, vtmp);

            vacc2 = vfmaq_m_f32(vacc2, vtmp, vinvSigma, p0);

            /* pIn is reset at the top of the class loop, so only the
               per-class parameter pointers need to advance here. */
            pTheta += blkCnt;
            pSigma += blkCnt;
        }

        tmp = -0.5f * vecAddAcrossF32Mve(vacc1);
        tmp -= 0.5f * vecAddAcrossF32Mve(vacc2);

        *buffer = tmp + *pLogPrior++;
        buffer++;
    }

    /* The predicted class is the position of the largest score. */
    arm_max_f32(pOutputProbabilities, S->numberOfClasses, &result, &index);

    return (index);
}
#else
#if defined(ARM_MATH_NEON)
#include "NEMath.h"
/*
 * Neon float32 implementation.
 *
 * Classes are scored two at a time: pTheta/pSigma walk the parameters of the
 * first class of the pair, pTheta1/pSigma1 those of the second class, which
 * start S->vectorDimension elements further on. A leftover odd class is
 * scored on its own afterwards.
 *
 * Every score starts from logf(prior) and has
 *   0.5 * log(2 * PI * (sigma + epsilon))  and
 *   0.5 * (in - theta)^2 / (sigma + epsilon)
 * subtracted for each dimension. Scores are written to pOutputProbabilities
 * and the index of the largest one is returned. pBufferB is not used here.
 */
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
const float32_t * in,
float32_t *pOutputProbabilities,
float32_t *pBufferB)
{
const float32_t *pPrior = S->classPriors;
const float32_t *pTheta = S->theta;
const float32_t *pSigma = S->sigma;
/* Second class of each pair: one full parameter row after the first. */
const float32_t *pTheta1 = S->theta + S->vectorDimension;
const float32_t *pSigma1 = S->sigma + S->vectorDimension;
float32_t *buffer = pOutputProbabilities;
const float32_t *pIn=in;
float32_t result;
float32_t sigma,sigma1;
float32_t tmp,tmp1;
uint32_t index;
uint32_t vecBlkCnt;
uint32_t classBlkCnt;
float32x4_t epsilonV;
float32x4_t sigmaV,sigmaV1;
float32x4_t tmpV,tmpVb,tmpV1;
float32x2_t tmpV2;
float32x4_t thetaV,thetaV1;
float32x4_t inV;
(void)pBufferB;
/* epsilon broadcast to all lanes; added to every variance below. */
epsilonV = vdupq_n_f32(S->epsilon);
/* Number of class pairs. */
classBlkCnt = S->numberOfClasses >> 1;
while(classBlkCnt > 0)
{
/* The same input vector is scored against both classes of the pair. */
pIn = in;
/* Scalar parts of the two scores start from the log priors. */
tmp = logf(*pPrior++);
tmp1 = logf(*pPrior++);
/* Vector accumulators for the two classes. */
tmpV = vdupq_n_f32(0.0f);
tmpV1 = vdupq_n_f32(0.0f);
/* Vectorised part: 4 dimensions per iteration. */
vecBlkCnt = S->vectorDimension >> 2;
while(vecBlkCnt > 0)
{
sigmaV = vld1q_f32(pSigma);
thetaV = vld1q_f32(pTheta);
sigmaV1 = vld1q_f32(pSigma1);
thetaV1 = vld1q_f32(pTheta1);
inV = vld1q_f32(pIn);
sigmaV = vaddq_f32(sigmaV, epsilonV);
sigmaV1 = vaddq_f32(sigmaV1, epsilonV);
/* First class: accumulator -= 0.5 * log(2*PI*sigma) */
tmpVb = vmulq_n_f32(sigmaV,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
/* Second class: same log term */
tmpVb = vmulq_n_f32(sigmaV1,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
/* First class: accumulator -= 0.5 * (in - theta)^2 * vinvq_f32(sigma).
   NOTE(review): vinvq_f32 comes from NEMath.h and is presumably a
   reciprocal approximation; the scalar tail below uses an exact
   division. Confirm the accuracy difference is acceptable. */
tmpVb = vsubq_f32(inV,thetaV);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
/* Second class: same squared-distance term */
tmpVb = vsubq_f32(inV,thetaV1);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV1));
tmpV1 = vmlsq_n_f32(tmpV1,tmpVb,0.5f);
pIn += 4;
pTheta += 4;
pSigma += 4;
pTheta1 += 4;
pSigma1 += 4;
vecBlkCnt--;
}
/* Horizontal sum of the four lanes, folded into the scalar scores. */
tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
/* Scalar tail: the remaining 0..3 dimensions. */
vecBlkCnt = S->vectorDimension & 3;
while(vecBlkCnt > 0)
{
sigma = *pSigma + S->epsilon;
sigma1 = *pSigma1 + S->epsilon;
tmp -= 0.5f*logf(2.0f * PI_F * sigma);
tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
tmp1 -= 0.5f*logf(2.0f * PI_F * sigma1);
tmp1 -= 0.5f*(*pIn - *pTheta1) * (*pIn - *pTheta1) / sigma1;
pIn++;
pTheta++;
pSigma++;
pTheta1++;
pSigma1++;
vecBlkCnt--;
}
*buffer++ = tmp;
*buffer++ = tmp1;
/* Each pointer has already advanced by vectorDimension in the loops above;
   skip one more row so it jumps over the class handled by the other
   pointer pair and lands on the next class pair. */
pSigma += S->vectorDimension;
pTheta += S->vectorDimension;
pSigma1 += S->vectorDimension;
pTheta1 += S->vectorDimension;
classBlkCnt--;
}
/* Leftover class when numberOfClasses is odd (runs at most once). */
classBlkCnt = S->numberOfClasses & 1;
while(classBlkCnt > 0)
{
pIn = in;
tmp = logf(*pPrior++);
tmpV = vdupq_n_f32(0.0f);
/* Vectorised part: 4 dimensions per iteration. */
vecBlkCnt = S->vectorDimension >> 2;
while(vecBlkCnt > 0)
{
sigmaV = vld1q_f32(pSigma);
thetaV = vld1q_f32(pTheta);
inV = vld1q_f32(pIn);
sigmaV = vaddq_f32(sigmaV, epsilonV);
/* accumulator -= 0.5 * log(2*PI*sigma) */
tmpVb = vmulq_n_f32(sigmaV,DPI_F);
tmpVb = vlogq_f32(tmpVb);
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
/* accumulator -= 0.5 * (in - theta)^2 * vinvq_f32(sigma) */
tmpVb = vsubq_f32(inV,thetaV);
tmpVb = vmulq_f32(tmpVb,tmpVb);
tmpVb = vmulq_f32(tmpVb, vinvq_f32(sigmaV));
tmpV = vmlsq_n_f32(tmpV,tmpVb,0.5f);
pIn += 4;
pTheta += 4;
pSigma += 4;
vecBlkCnt--;
}
/* Horizontal sum of the four lanes, folded into the scalar score. */
tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
/* Scalar tail: the remaining 0..3 dimensions. */
vecBlkCnt = S->vectorDimension & 3;
while(vecBlkCnt > 0)
{
sigma = *pSigma + S->epsilon;
tmp -= 0.5f*logf(2.0f * PI_F * sigma);
tmp -= 0.5f*(*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
pIn++;
pTheta++;
pSigma++;
vecBlkCnt--;
}
*buffer++ = tmp;
classBlkCnt--;
}
/* The predicted class is the position of the largest score. */
arm_max_f32(pOutputProbabilities,S->numberOfClasses,&result,&index);
return(index);
}
#else
/*
 * Portable scalar float32 implementation.
 *
 * Each class gets the score
 *   -0.5 * sum_d log(2 * PI * (sigma + epsilon))
 *   -0.5 * sum_d (in - theta)^2 / (sigma + epsilon)
 *   + log(prior)
 * written to pOutputProbabilities; the index of the largest score is
 * returned. pBufferB is not used by this variant.
 */
uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_instance_f32 *S,
   const float32_t * in,
   float32_t *pOutputProbabilities,
   float32_t *pBufferB)
{
    /* theta and sigma are consumed contiguously, one class after another. */
    const float32_t *prior    = S->classPriors;
    const float32_t *mean     = S->theta;
    const float32_t *variance = S->sigma;
    float32_t       *score    = pOutputProbabilities;
    float32_t        bestScore;
    uint32_t         bestClass;
    uint32_t         classIdx = 0U;

    (void)pBufferB;

    while (classIdx < S->numberOfClasses)
    {
        const float32_t *sample = in;   /* restart the input for each class */
        float32_t        logNorm = 0.0f;
        float32_t        sqDist = 0.0f;
        float32_t        logLikelihood;
        uint32_t         dimIdx = 0U;

        while (dimIdx < S->vectorDimension)
        {
            /* epsilon is added to every variance before it is used */
            float32_t var = *variance + S->epsilon;

            logNorm += logf(2.0f * PI_F * var);
            sqDist += (*sample - *mean) * (*sample - *mean) / var;

            sample++;
            mean++;
            variance++;
            dimIdx++;
        }

        logLikelihood = -0.5f * logNorm;
        logLikelihood -= 0.5f * sqDist;

        *score = logLikelihood + logf(*prior++);
        score++;

        classIdx++;
    }

    arm_max_f32(pOutputProbabilities, S->numberOfClasses, &bestScore, &bestClass);

    return bestClass;
}
#endif
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of groupBayes group
*/

View File

@@ -0,0 +1,310 @@
cmake_minimum_required(VERSION 3.14)
# CMP0077 NEW: option() does not override a normal variable of the same name,
# so a parent project can preset the switches below with plain set().
cmake_policy(SET CMP0077 NEW)

project(CMSISDSP)

# ROOT is supplied by the caller and is the directory that contains CMSIS/DSP.
# Without it every path below would silently resolve against "/CMSIS/DSP",
# so fail early with an explicit message instead.
if(NOT DEFINED ROOT)
  message(FATAL_ERROR
    "CMSISDSP: ROOT is not defined. Set it to the directory containing CMSIS/DSP, e.g. -DROOT=<path>.")
endif()

# DSP Sources
set(DSP "${ROOT}/CMSIS/DSP")

# Helper modules (configLib and others) are looked up in these two directories.
list(APPEND CMAKE_MODULE_PATH "${DSP}/Source" "${DSP}")

include(configLib)
# ---------------------------------------------------------------------------
# Acceleration and build-mode switches
# ---------------------------------------------------------------------------
option(NEON               "Neon acceleration"                             OFF)
option(NEONEXPERIMENTAL   "Neon experimental acceleration"                OFF)
option(HELIUMEXPERIMENTAL "Helium experimental acceleration"              OFF)
option(LOOPUNROLL         "Loop unrolling"                                ON)
option(ROUNDING           "Rounding"                                      OFF)
option(MATRIXCHECK        "Matrix Checks"                                 OFF)
option(HELIUM             "Helium acceleration (MVEF and MVEI supported)" OFF)
option(MVEF               "MVEF intrinsics supported"                     OFF)
option(MVEI               "MVEI intrinsics supported"                     OFF)
option(MVEFLOAT16         "Float16 MVE intrinsics supported"              OFF)
option(DISABLEFLOAT16     "Disable building float16 kernels"              OFF)
option(HOST               "Build for host"                                OFF)

# ---------------------------------------------------------------------------
# Component selection
#
# Choose which parts of CMSIS-DSP are compiled. Dependencies between the
# parts are NOT tracked by this script, so enabling one component may require
# enabling others by hand.
# ---------------------------------------------------------------------------
option(BASICMATH      "Basic Math Functions"             ON)
option(COMPLEXMATH    "Complex Math Functions"           ON)
option(CONTROLLER     "Controller Functions"             ON)
option(FASTMATH       "Fast Math Functions"              ON)
option(FILTERING      "Filtering Functions"              ON)
option(MATRIX         "Matrix Functions"                 ON)
option(STATISTICS     "Statistics Functions"             ON)
option(SUPPORT        "Support Functions"                ON)
option(TRANSFORM      "Transform Functions"              ON)
option(SVM            "Support Vector Machine Functions" ON)
option(BAYES          "Bayesian Estimators"              ON)
option(DISTANCE       "Distance Functions"               ON)
option(INTERPOLATION  "Interpolation Functions"          ON)
option(QUATERNIONMATH "Quaternion Math Functions"        ON)

# ---------------------------------------------------------------------------
# Table configuration
# ---------------------------------------------------------------------------
# OFF is the default behaviour: all tables are included.
option(CONFIGTABLE "Configuration of table allowed" OFF)

# With CONFIGTABLE ON: include every interpolation table.
option(ALLFAST "All interpolation tables included" OFF)

# With CONFIGTABLE ON: include every FFT table.
option(ALLFFT "All fft tables included" OFF)

# ---------------------------------------------------------------------------
# Per-feature table switches
#
# Each feature below needs a data table. Because some tables are large, the
# feature can be left out. These switches are only taken into account when
# CONFIGTABLE is ON.
# ---------------------------------------------------------------------------
option(ARM_COS_F32      "cos f32"      OFF)
option(ARM_COS_Q31      "cos q31"      OFF)
option(ARM_COS_Q15      "cos q15"      OFF)
option(ARM_SIN_F32      "sin f32"      OFF)
option(ARM_SIN_Q31      "sin q31"      OFF)
option(ARM_SIN_Q15      "sin q15"      OFF)
option(ARM_SIN_COS_F32  "sin cos f32"  OFF)
option(ARM_SIN_COS_Q31  "sin cos q31"  OFF)
option(ARM_LMS_NORM_Q31 "lms norm q31" OFF)
option(ARM_LMS_NORM_Q15 "lms norm q15" OFF)

# cmsisdsp_table_options(<PREFIX> <help> <size>...)
#
# Declares one OFF-by-default option per size, named "<PREFIX>_<size>" with
# the help text "<help> <size>".
# Example: cmsisdsp_table_options(CFFT_F32 "cfft f32" 16 32)
#          declares CFFT_F32_16 ("cfft f32 16") and CFFT_F32_32 ("cfft f32 32").
function(cmsisdsp_table_options prefix help)
  foreach(size IN LISTS ARGN)
    option(${prefix}_${size} "${help} ${size}" OFF)
  endforeach()
endfunction()

# Complex FFT tables
cmsisdsp_table_options(CFFT_F64 "cfft f64" 16 32 64 128 256 512 1024 2048 4096)
cmsisdsp_table_options(CFFT_F32 "cfft f32" 16 32 64 128 256 512 1024 2048 4096)
cmsisdsp_table_options(CFFT_Q31 "cfft q31" 16 32 64 128 256 512 1024 2048 4096)
cmsisdsp_table_options(CFFT_Q15 "cfft q15" 16 32 64 128 256 512 1024 2048 4096)

# Real FFT tables, floating point
cmsisdsp_table_options(RFFT_FAST_F32 "rfft fast f32" 32 64 128 256 512 1024 2048 4096)
cmsisdsp_table_options(RFFT_F32      "rfft f32"      128 512 2048 8192)
cmsisdsp_table_options(RFFT_FAST_F64 "rfft fast f64" 32 64 128 256 512 1024 2048 4096)
cmsisdsp_table_options(RFFT_F64      "rfft f64"      128 512 2048 8192)
cmsisdsp_table_options(RFFT_FAST_F16 "rfft fast f16" 32 64 128 256 512 1024 2048 4096)

# Real FFT tables, fixed point
cmsisdsp_table_options(RFFT_Q31 "rfft q31" 32 64 128 256 512 1024 2048 4096 8192)
cmsisdsp_table_options(RFFT_Q15 "rfft q15" 32 64 128 256 512 1024 2048 4096 8192)

# DCT4 tables
cmsisdsp_table_options(DCT4_F32 "dct4 f32" 128 512 2048 8192)
cmsisdsp_table_options(DCT4_Q31 "dct4 q31" 128 512 2048 8192)
cmsisdsp_table_options(DCT4_Q15 "dct4 q15" 128 512 2048 8192)

# Deprecated radix-2 / radix-4 complex FFTs
option(ARM_CFFT_RADIX2_Q15 "deprecated q15 radix2 cfft" OFF)
option(ARM_CFFT_RADIX4_Q15 "deprecated q15 radix4 cfft" OFF)
option(ARM_CFFT_RADIX2_Q31 "deprecated q31 radix2 cfft" OFF)
option(ARM_CFFT_RADIX4_Q31 "deprecated q31 radix4 cfft" OFF)
###########################
#
# CMSIS DSP
#
###########################
# Umbrella target: consumers link CMSISDSP and transitively get every
# component library that was enabled through the options above.
add_library(CMSISDSP INTERFACE)

if(BASICMATH)
  add_subdirectory(BasicMathFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBasicMath)
endif()

if(COMPLEXMATH)
  add_subdirectory(ComplexMathFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPComplexMath)
endif()

if(QUATERNIONMATH)
  add_subdirectory(QuaternionMathFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPQuaternionMath)
endif()

if(CONTROLLER)
  add_subdirectory(ControllerFunctions)
  if(CONFIGTABLE)
    # Fast tables inclusion is allowed
    target_compile_definitions(CMSISDSPController PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPController)
endif()

if(FASTMATH)
  add_subdirectory(FastMathFunctions)
  if(CONFIGTABLE)
    # Fast tables inclusion is allowed
    target_compile_definitions(CMSISDSPFastMath PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFastMath)
endif()

if(FILTERING)
  add_subdirectory(FilteringFunctions)
  if(CONFIGTABLE)
    # Fast tables inclusion is allowed
    target_compile_definitions(CMSISDSPFiltering PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPFiltering)
endif()

if(MATRIX)
  add_subdirectory(MatrixFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPMatrix)
endif()

if(STATISTICS)
  add_subdirectory(StatisticsFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPStatistics)
endif()

if(SUPPORT)
  add_subdirectory(SupportFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSupport)
endif()

if(TRANSFORM)
  add_subdirectory(TransformFunctions)
  if(CONFIGTABLE)
    # FFT tables inclusion is allowed
    target_compile_definitions(CMSISDSPTransform PUBLIC ARM_FFT_ALLOW_TABLES)
  endif()
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPTransform)
endif()

# The common tables are built as soon as any component that uses them is on.
if(FILTERING OR CONTROLLER OR FASTMATH OR TRANSFORM OR SVM OR DISTANCE)
  add_subdirectory(CommonTables)

  if(TRANSFORM AND CONFIGTABLE)
    # FFT tables inclusion is allowed
    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FFT_ALLOW_TABLES)
  endif()

  if((FILTERING OR CONTROLLER OR FASTMATH) AND CONFIGTABLE)
    # Select which tables to include
    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_FAST_ALLOW_TABLES)
  endif()

  target_link_libraries(CMSISDSP INTERFACE CMSISDSPCommon)
  # The Common project also adds the ComputeLibrary tables used by SVM and
  # Distance when Neon is ON.
endif()

if(SVM)
  add_subdirectory(SVMFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPSVM)
endif()

if(BAYES)
  add_subdirectory(BayesFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPBayes)
endif()

if(DISTANCE)
  add_subdirectory(DistanceFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPDistance)
endif()

if(INTERPOLATION)
  add_subdirectory(InterpolationFunctions)
  target_link_libraries(CMSISDSP INTERFACE CMSISDSPInterpolation)
endif()

### Includes
target_include_directories(CMSISDSP INTERFACE "${DSP}/Include")

View File

@@ -0,0 +1,52 @@
cmake_minimum_required(VERSION 3.14)

project(CMSISDSPCommon)

include(configLib)
include(configDsp)

# Static library holding the tables shared by the other CMSIS-DSP components.
add_library(CMSISDSPCommon STATIC
  arm_common_tables.c
  arm_common_tables_f16.c
)

configLib(CMSISDSPCommon ${ROOT})
configDsp(CMSISDSPCommon ${ROOT})

# With table configuration enabled, ALLFFT / ALLFAST request every FFT /
# interpolation table at once.
if(CONFIGTABLE AND ALLFFT)
  target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FFT_TABLES)
endif()

if(CONFIGTABLE AND ALLFAST)
  target_compile_definitions(CMSISDSPCommon PUBLIC ARM_ALL_FAST_TABLES)
endif()

# Per-table configuration provided by the fft and interpol helper modules.
include(fft)
fft(CMSISDSPCommon)

include(interpol)
interpol(CMSISDSPCommon)

target_sources(CMSISDSPCommon PRIVATE
  arm_const_structs.c
  arm_const_structs_f16.c
)

### Includes
target_include_directories(CMSISDSPCommon PUBLIC "${DSP}/Include")

# ComputeLibrary tables, only built for the Neon variants.
if(NEON OR NEONEXPERIMENTAL)
  target_sources(CMSISDSPCommon PRIVATE
    "${DSP}/ComputeLibrary/Source/arm_cl_tables.c"
  )
endif()

# Helium tables.
# NOTE(review): MVEI on its own does not add these files - confirm that no
# table from arm_mve_tables.c is needed in an MVEI-only configuration.
if(HELIUM OR MVEF)
  target_sources(CMSISDSPCommon PRIVATE
    "${DSP}/Source/CommonTables/arm_mve_tables.c"
    "${DSP}/Source/CommonTables/arm_mve_tables_f16.c"
  )
endif()

# WRAPPER builds force the 1024-entry bit-reversal table and the 4096-point
# twiddle tables on; the f16 twiddles are skipped for ARMAC5 and when float16
# kernels are disabled.
if(WRAPPER)
  target_compile_definitions(CMSISDSPCommon PUBLIC
    ARM_TABLE_BITREV_1024
    ARM_TABLE_TWIDDLECOEF_F32_4096
    ARM_TABLE_TWIDDLECOEF_Q31_4096
    ARM_TABLE_TWIDDLECOEF_Q15_4096
  )
  if((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
    target_compile_definitions(CMSISDSPCommon PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
  endif()
endif()

View File

@@ -0,0 +1,31 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CommonTables.c
* Description: Combination of all common table source files.
*
* $Date: 08. January 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_common_tables.c"
#include "arm_const_structs.c"
#include "arm_mve_tables.c"

View File

@@ -0,0 +1,31 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CommonTables.c
* Description: Combination of all common table source files.
*
* $Date: 08. January 2020
* $Revision: V1.1.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_common_tables_f16.c"
#include "arm_const_structs_f16.c"
#include "arm_mve_tables_f16.c"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,379 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_const_structs.c
* Description: Constant structs that are initialized for user convenience.
* For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_const_structs.h"
/* Floating-point structs */
/*
 * Ready-made complex FFT instances for float32, one per supported length
 * (16 to 4096). Each initializer lists, in order: the transform length, the
 * twiddle table for that length, the bit-reversal index table for that
 * length, and the length macro of that bit-reversal table.
 */
const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE_16_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH
};
const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE_4096_TABLE_LENGTH
};
/* Fixed-point structs */
/*
 * Ready-made complex FFT instances for q31, one per supported length
 * (16 to 4096). Each initializer lists, in order: the transform length, the
 * q31 twiddle table for that length, the fixed-point bit-reversal index
 * table for that length, and the length macro of that table.
 */
const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
/*
 * Ready-made complex FFT instances for q15, one per supported length
 * (16 to 4096). Same layout as the q31 instances: transform length, q15
 * twiddle table, fixed-point bit-reversal index table (the same tables the
 * q31 instances use) and that table's length macro.
 */
const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
/* Structure for real-value inputs */
/* Floating-point structs */
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len32 = {
{ 16, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_16_TABLE_LENGTH },
32U,
(float32_t *)twiddleCoef_rfft_32
};
/*
 * Ready-made real FFT instances for float32, lengths 64 to 4096. Each one
 * embeds a complex FFT instance of half the real length (length, twiddle
 * table, bit-reversal table and its length macro), followed by the real FFT
 * length and the real-FFT twiddle table for that length.
 */
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len64 = {
{ 32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE_32_TABLE_LENGTH },
64U,
(float32_t *)twiddleCoef_rfft_64
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len128 = {
{ 64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE_64_TABLE_LENGTH },
128U,
(float32_t *)twiddleCoef_rfft_128
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len256 = {
{ 128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH },
256U,
(float32_t *)twiddleCoef_rfft_256
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len512 = {
{ 256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH },
512U,
(float32_t *)twiddleCoef_rfft_512
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len1024 = {
{ 512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH },
1024U,
(float32_t *)twiddleCoef_rfft_1024
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len2048 = {
{ 1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE_1024_TABLE_LENGTH },
2048U,
(float32_t *)twiddleCoef_rfft_2048
};
const arm_rfft_fast_instance_f32 arm_rfft_fast_sR_f32_len4096 = {
{ 2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE_2048_TABLE_LENGTH },
4096U,
(float32_t *)twiddleCoef_rfft_4096
};
/* Fixed-point structs */
/* q31_t */
/*
Constant Q31 real-FFT instances for 32 to 8192 real samples.
All instances share the two 8192-entry coefficient tables declared below.
Pattern visible in every initializer, in order:
  1. the real FFT length;
  2. the constant 0;
  3. the constant 1;
  4. a stride equal to 8192 / length (256 for 32 samples down to 1 for 8192),
     i.e. the step needed to walk the shared 8192-entry tables;
  5. / 6. the shared A and B coefficient tables (cast drops const);
  7. the address of the Q31 CFFT instance of half the real length.
NOTE(review): items 2 and 3 are presumably the inverse-transform and
bit-reversal flags of arm_rfft_instance_q31 - the struct is declared outside
this chunk, confirm against the header.
*/
extern const q31_t realCoefAQ31[8192];
extern const q31_t realCoefBQ31[8192];
const arm_rfft_instance_q31 arm_rfft_sR_q31_len32 = {
32U,
0,
1,
256U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len16
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len64 = {
64U,
0,
1,
128U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len32
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len128 = {
128U,
0,
1,
64U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len64
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len256 = {
256U,
0,
1,
32U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len128
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len512 = {
512U,
0,
1,
16U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len256
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len1024 = {
1024U,
0,
1,
8U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len512
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len2048 = {
2048U,
0,
1,
4U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len1024
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len4096 = {
4096U,
0,
1,
2U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len2048
};
const arm_rfft_instance_q31 arm_rfft_sR_q31_len8192 = {
8192U,
0,
1,
1U,
(q31_t*)realCoefAQ31,
(q31_t*)realCoefBQ31,
&arm_cfft_sR_q31_len4096
};
/* q15_t */
/*
Constant Q15 real-FFT instances for 32 to 8192 real samples.
All instances share the two 8192-entry coefficient tables declared below.
Pattern visible in every initializer, in order:
  1. the real FFT length;
  2. the constant 0;
  3. the constant 1;
  4. a stride equal to 8192 / length (256 for 32 samples down to 1 for 8192),
     i.e. the step needed to walk the shared 8192-entry tables;
  5. / 6. the shared A and B coefficient tables (cast drops const);
  7. the address of the Q15 CFFT instance of half the real length.
NOTE(review): items 2 and 3 are presumably the inverse-transform and
bit-reversal flags of arm_rfft_instance_q15 - the struct is declared outside
this chunk, confirm against the header.
*/
extern const q15_t realCoefAQ15[8192];
extern const q15_t realCoefBQ15[8192];
const arm_rfft_instance_q15 arm_rfft_sR_q15_len32 = {
32U,
0,
1,
256U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len16
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len64 = {
64U,
0,
1,
128U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len32
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len128 = {
128U,
0,
1,
64U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len64
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len256 = {
256U,
0,
1,
32U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len128
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len512 = {
512U,
0,
1,
16U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len256
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len1024 = {
1024U,
0,
1,
8U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len512
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len2048 = {
2048U,
0,
1,
4U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len1024
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len4096 = {
4096U,
0,
1,
2U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len2048
};
const arm_rfft_instance_q15 arm_rfft_sR_q15_len8192 = {
8192U,
0,
1,
1U,
(q15_t*)realCoefAQ15,
(q15_t*)realCoefBQ15,
&arm_cfft_sR_q15_len4096
};

View File

@@ -0,0 +1,120 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_const_structs_f16.c
* Description: Constant structs that are initialized for user convenience.
* For example, some can be given as arguments to the arm_cfft_f32() or arm_rfft_f32() functions.
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math_types_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
#include "arm_const_structs_f16.h"
/*
ALLOW TABLE is true when config table is enabled and the Transform folder is included
for compilation.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
/* Floating-point structs */
#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
/*
These structures cannot be used to initialize the MVE version of the FFT F16 instances,
so they are not compiled when MVE float16 is enabled (unless autovectorization is requested).
For the MVE version, arm_cfft_init_f16 must be used instead.
*/
#if !defined(__CC_ARM)
/*
Constant half-precision CFFT instances for 16 to 4096 points.
Each initializer lists, in order: the transform length, the F16 twiddle table
named for that length, the fixed bit-reversal index table named for that
length, and the length macro of that bit-reversal table.
Each instance is compiled when tables are not individually configured
(ARM_DSP_CONFIG_TABLES undefined), when all FFT tables are requested
(ARM_ALL_FFT_TABLES), or when both per-length table macros are defined.
NOTE(review): every guard tests ARM_TABLE_BITREVIDX_FLT_<len>, while the
initializer references armBitRevIndexTable_fixed_<len>; confirm that the FLT
macro is really the one that enables the fixed table when
ARM_DSP_CONFIG_TABLES is defined.
*/
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len16 = {
16, twiddleCoefF16_16, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len32 = {
32, twiddleCoefF16_32, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len64 = {
64, twiddleCoefF16_64, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len128 = {
128, twiddleCoefF16_128, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len256 = {
256, twiddleCoefF16_256, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len512 = {
512, twiddleCoefF16_512, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len1024 = {
1024, twiddleCoefF16_1024, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len2048 = {
2048, twiddleCoefF16_2048, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
};
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
const arm_cfft_instance_f16 arm_cfft_sR_f16_len4096 = {
4096, twiddleCoefF16_4096, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
};
#endif
#endif /* !defined(__CC_ARM) */
#endif /* !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE) */
#endif
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,74 @@
cmake_minimum_required (VERSION 3.14)

project(CMSISDSPComplexMath)

include(configLib)
include(configDsp)

# Static library collecting the CMSIS-DSP complex math kernels.
add_library(CMSISDSPComplexMath STATIC)

configLib(CMSISDSPComplexMath ${ROOT})
configDsp(CMSISDSPComplexMath ${ROOT})

include(interpol)
interpol(CMSISDSPComplexMath)

# With table configuration enabled, ALLFAST requests every fast-math table.
if (CONFIGTABLE AND ALLFAST)
  target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
endif()

# The vectorized code defines sqrt, so the fast tables are needed
# even when the Fast Math folder itself is not built.
if (CONFIGTABLE AND (HELIUM OR MVEF OR MVEI))
  target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_FAST_ALLOW_TABLES)
endif()

# The MVE integer magnitude kernels rely on a fast sqrt table. Each of them can
# be left out (together with its table) when tables are configured
# individually; they are always built when MVE integer code is not in use.
if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT (HELIUM OR MVEI)))
  target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
endif()

if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT (HELIUM OR MVEI)))
  target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
endif()

if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_FAST_Q15 OR (NOT (HELIUM OR MVEI)))
  target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_fast_q15.c)
endif()

# Kernels that are always part of the library.
target_sources(CMSISDSPComplexMath PRIVATE
  arm_cmplx_conj_f32.c
  arm_cmplx_conj_q15.c
  arm_cmplx_conj_q31.c
  arm_cmplx_dot_prod_f32.c
  arm_cmplx_dot_prod_q15.c
  arm_cmplx_dot_prod_q31.c
  arm_cmplx_mag_f32.c
  arm_cmplx_mag_f64.c
  arm_cmplx_mag_squared_f32.c
  arm_cmplx_mag_squared_f64.c
  arm_cmplx_mag_squared_q15.c
  arm_cmplx_mag_squared_q31.c
  arm_cmplx_mult_cmplx_f32.c
  arm_cmplx_mult_cmplx_f64.c
  arm_cmplx_mult_cmplx_q15.c
  arm_cmplx_mult_cmplx_q31.c
  arm_cmplx_mult_real_f32.c
  arm_cmplx_mult_real_q15.c
  arm_cmplx_mult_real_q31.c
)

# Half-precision kernels: skipped for Arm Compiler 5 and when float16 is disabled.
if (NOT (ARMAC5 OR DISABLEFLOAT16))
  target_sources(CMSISDSPComplexMath PRIVATE
    arm_cmplx_conj_f16.c
    arm_cmplx_dot_prod_f16.c
    arm_cmplx_mag_f16.c
    arm_cmplx_mag_squared_f16.c
    arm_cmplx_mult_cmplx_f16.c
    arm_cmplx_mult_real_f16.c
  )
endif()

### Includes
target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")

View File

@@ -0,0 +1,66 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: ComplexMathFunctions.c
* Description: Combination of all complex math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f32.c"
#include "arm_cmplx_conj_q15.c"
#include "arm_cmplx_conj_q31.c"
#include "arm_cmplx_dot_prod_f32.c"
#include "arm_cmplx_dot_prod_q15.c"
#include "arm_cmplx_dot_prod_q31.c"
#include "arm_cmplx_mag_f32.c"
#include "arm_cmplx_mag_f64.c"
#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
#include "arm_cmplx_mag_fast_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q31.c"
#endif
#else
#include "arm_cmplx_mag_q15.c"
#include "arm_cmplx_mag_fast_q15.c"
#include "arm_cmplx_mag_q31.c"
#endif
#include "arm_cmplx_mag_squared_f32.c"
#include "arm_cmplx_mag_squared_f64.c"
#include "arm_cmplx_mag_squared_q15.c"
#include "arm_cmplx_mag_squared_q31.c"
#include "arm_cmplx_mult_cmplx_f32.c"
#include "arm_cmplx_mult_cmplx_f64.c"
#include "arm_cmplx_mult_cmplx_q15.c"
#include "arm_cmplx_mult_cmplx_q31.c"
#include "arm_cmplx_mult_real_f32.c"
#include "arm_cmplx_mult_real_q15.c"
#include "arm_cmplx_mult_real_q31.c"

View File

@@ -0,0 +1,32 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: ComplexMathFunctionsF16.c
* Description: Combination of all complex math function f16 source files.
*
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f16.c"
#include "arm_cmplx_dot_prod_f16.c"
#include "arm_cmplx_mag_f16.c"
#include "arm_cmplx_mag_squared_f16.c"
#include "arm_cmplx_mult_cmplx_f16.c"
#include "arm_cmplx_mult_real_f16.c"

View File

@@ -0,0 +1,185 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_f16.c
* Description: Floating-point complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * MVE half-precision variant: every vector of eight float16 values is
 * multiplied by an alternating {+1, -1} pattern, which flips the sign of
 * each second value (the imaginary parts of the interleaved data).
 */
void arm_cmplx_conj_f16(
    const float16_t * pSrc,
    float16_t * pDst,
    uint32_t numSamples)
{
    /* +1 for the real lanes, -1 for the imaginary lanes */
    static const float16_t cmplx_conj_sign[8] = { 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f };
    uint32_t totalValues = numSamples * CMPLX_DIM; /* number of scalar values to process */
    uint32_t vecIters;                             /* full 8-value vectors */
    uint32_t tailSamples;                          /* complex samples left after the vector loop */
    f16x8_t lanes;
    f16x8_t signPattern;

    /* Load the sign pattern once, outside the loop */
    signPattern = *(f16x8_t *) cmplx_conj_sign;

    /* Vector loop: eight float16 values per iteration */
    for (vecIters = totalValues >> 3U; vecIters > 0U; vecIters--)
    {
        lanes = vld1q(pSrc);
        vst1q(pDst,vmulq(lanes, signPattern));

        pSrc += 8;
        pDst += 8;
    }

    /* Scalar tail: one (real, imag) pair per iteration */
    for (tailSamples = (totalValues & 0x7) >> 1; tailSamples > 0U; tailSamples--)
    {
        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
        *pDst++ = *pSrc++;
        *pDst++ = -*pSrc++;
    }
}
#else
/*
 * Scalar half-precision variant: copies each real part and negates each
 * imaginary part, optionally four complex samples per loop iteration.
 */
void arm_cmplx_conj_f16(
    const float16_t * pSrc,
    float16_t * pDst,
    uint32_t numSamples)
{
    uint32_t remaining; /* complex samples handled by the final loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t groups;    /* groups of four complex samples */

    /* Unrolled loop: four complex samples (eight values) per iteration */
    for (groups = numSamples >> 2U; groups > 0U; groups--)
    {
        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
        pDst[0] = pSrc[0];
        pDst[1] = -pSrc[1];

        pDst[2] = pSrc[2];
        pDst[3] = -pSrc[3];

        pDst[4] = pSrc[4];
        pDst[5] = -pSrc[5];

        pDst[6] = pSrc[6];
        pDst[7] = -pSrc[7];

        pSrc += 8;
        pDst += 8;
    }

    /* Samples that did not fit into a group of four */
    remaining = numSamples & 0x3U;
#else
    /* No unrolling: every sample goes through the loop below */
    remaining = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining > 0U; remaining--)
    {
        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
        pDst[0] = pSrc[0];
        pDst[1] = -pSrc[1];

        pSrc += 2;
        pDst += 2;
    }
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_conj group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,171 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_f32.c
* Description: Floating-point complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_conj Complex Conjugate
*
* Conjugates the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[(2*n)+0] = pSrc[(2*n)+0]; // real part
* pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_conj
* @{
*/
/**
* @brief Floating-point complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*/
void arm_cmplx_conj_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t numSamples)
{
  uint32_t remaining;               /* complex samples handled by the final loop */

#if defined (ARM_MATH_DSP)
  /* Unrolled path, selected when ARM_MATH_DSP is defined */
  float32_t re0, re1, re2, re3;     /* real parts of the current four samples */
  float32_t im0, im1, im2, im3;     /* imaginary parts of the current four samples */
  uint32_t groups;                  /* groups of four complex samples */

  /* Four complex samples per iteration; the loop after the #endif
     handles the 0 to 3 samples that are left over. */
  for (groups = numSamples >> 2U; groups > 0U; groups--)
  {
    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */

    /* Real parts are copied through unchanged */
    re0 = pSrc[0];
    pDst[0] = re0;
    re1 = pSrc[2];
    pDst[2] = re1;
    re2 = pSrc[4];
    pDst[4] = re2;
    re3 = pSrc[6];
    pDst[6] = re3;

    /* Read all four imaginary parts before writing any of them back */
    im0 = pSrc[1];
    im1 = pSrc[3];
    im2 = pSrc[5];
    im3 = pSrc[7];

    /* Imaginary parts are stored with the sign flipped */
    pDst[1] = -im0;
    pDst[3] = -im1;
    pDst[5] = -im2;
    pDst[7] = -im3;

    /* Advance both buffers by four complex samples */
    pSrc += 8U;
    pDst += 8U;
  }

  /* Samples that did not fit into a group of four */
  remaining = numSamples & 0x3U;

#else
  /* Plain path: every sample goes through the loop below */
  remaining = numSamples;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* realOut + j (imagOut) = realIn + j (-1) imagIn */
    pDst[0] = pSrc[0];
    pDst[1] = -pSrc[1];

    pSrc += 2;
    pDst += 2;
  }
}
/**
* @} end of cmplx_conj group
*/

View File

@@ -0,0 +1,149 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_q15.c
* Description: Q15 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
*/
/**
* @brief Q15 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
*/
void arm_cmplx_conj_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
#if defined (ARM_MATH_DSP)
/* SIMD path, selected when ARM_MATH_DSP is defined */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4; /* four complex samples, each held as one 32-bit word */
q31_t zero = 0; /* first operand of the saturating exchange intrinsics below */
/* Loop unrolling: number of groups of four complex samples */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** A second loop below computes the remaining 0 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* Each read consumes one complex sample (two q15 values) as a single word;
** __SIMD32 is defined outside this file - presumably a 32-bit pointer cast. */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
/* Saturating negate of the imaginary half-word by combining with zero.
** NOTE(review): per the CMSIS intrinsic definitions (not visible here) QASX/QSAX
** exchange the half-words of the second operand, so the result is left with its
** halves swapped; the endianness switch picks the variant that negates the
** half-word holding the imaginary part - confirm against the intrinsic reference. */
#ifndef ARM_MATH_BIG_ENDIAN
in1 = __QASX(zero, in1);
in2 = __QASX(zero, in2);
in3 = __QASX(zero, in3);
in4 = __QASX(zero, in4);
#else
in1 = __QSAX(zero, in1);
in2 = __QSAX(zero, in2);
in3 = __QSAX(zero, in3);
in4 = __QSAX(zero, in4);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Rotate each word by 16 bits to swap its two half-words back */
in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
in2 = ((uint32_t) in2 >> 16) | ((uint32_t) in2 << 16);
in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);
/* Store the four conjugated samples, one word each */
*__SIMD32(pDst)++ = in1;
*__SIMD32(pDst)++ = in2;
*__SIMD32(pDst)++ = in3;
*__SIMD32(pDst)++ = in4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Copy the real part; negate the imaginary part and saturate it to 16 bits
** (the header comment states 0x8000 becomes 0x7FFF). */
*pDst++ = *pSrc++;
*pDst++ = __SSAT(-*pSrc++, 16);
/* Decrement the loop counter */
blkCnt--;
}
#else
q15_t in; /* imaginary part of the current sample */
/* Plain path: one complex sample per iteration, numSamples is the counter */
while (numSamples > 0U)
{
/* realOut + j (imagOut) = realIn+ j (-1) imagIn */
/* Copy the real part; negate the imaginary part, mapping 0x8000 to 0x7fff
** explicitly because its negation does not fit in q15. */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
/* Decrement the loop counter */
numSamples--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
* @} end of cmplx_conj group
*/

View File

@@ -0,0 +1,169 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_q31.c
* Description: Q31 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
*/
/**
* @brief Q31 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
void arm_cmplx_conj_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t numSamples)
{
  uint32_t remaining;               /* complex samples handled by the final loop */
  q31_t imag;                       /* imaginary part of the current sample */

#if defined (ARM_MATH_DSP)
  /* Unrolled path, selected when ARM_MATH_DSP is defined */
  q31_t re0, re1, re2, re3;         /* real parts of the current four samples */
  q31_t im0, im1, im2, im3;         /* imaginary parts of the current four samples */
  uint32_t groups;                  /* groups of four complex samples */

  /* Four complex samples per iteration; the loop after the #endif
     handles the 0 to 3 samples that are left over. */
  for (groups = numSamples >> 2U; groups > 0U; groups--)
  {
    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */

    /* Real parts are copied straight through while the
       imaginary parts are collected for negation. */
    re0 = pSrc[0];
    pDst[0] = re0;
    im0 = pSrc[1];

    re1 = pSrc[2];
    pDst[2] = re1;
    im1 = pSrc[3];

    re2 = pSrc[4];
    pDst[4] = re2;
    im2 = pSrc[5];

    re3 = pSrc[6];
    pDst[6] = re3;
    im3 = pSrc[7];

    /* Saturating negation: 0 - x, so 0x80000000 becomes 0x7fffffff */
    pDst[1] = __QSUB(0, im0);
    pDst[3] = __QSUB(0, im1);
    pDst[5] = __QSUB(0, im2);
    pDst[7] = __QSUB(0, im3);

    /* Advance both buffers by four complex samples */
    pSrc += 8U;
    pDst += 8U;
  }

  /* Samples that did not fit into a group of four */
  remaining = numSamples & 0x3U;

#else
  /* Plain path: every sample goes through the loop below */
  remaining = numSamples;

#endif /* #if defined (ARM_MATH_DSP) */

  for (; remaining > 0U; remaining--)
  {
    /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
    *pDst++ = *pSrc++;

    /* Negate the imaginary part, saturating the one value whose
       negation does not fit (0x80000000 -> 0x7fffffff). */
    imag = *pSrc++;
    if (imag == INT32_MIN)
    {
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -imag;
    }
  }
}
/**
* @} end of cmplx_conj group
*/

View File

@@ -0,0 +1,288 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_f16.c
* Description: Floating-point complex dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * MVE (Helium) implementation of the half-precision complex dot product.
 *
 * Products are accumulated into a single 8-lane vector accumulator
 * (vec_acc) and reduced to one real and one imaginary scalar at the very
 * end by mve_cmplx_sum_intra_r_i_f16().
 *
 * Two code paths exist:
 *  - a software-pipelined path, taken when (numSamples >> 3) - 1 > 0,
 *    i.e. when numSamples >= 16;
 *  - a fully predicated "small vector" path for everything else.
 *
 * Each vector load moves 8 float16_t values (the pointers advance by 8),
 * i.e. 4 interleaved complex samples.
 */
void arm_cmplx_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t numSamples,
float16_t * realResult,
float16_t * imagResult)
{
int32_t blkCnt;
float16_t real_sum, imag_sum;
f16x8_t vecSrcA, vecSrcB;
f16x8_t vec_acc = vdupq_n_f16(0.0f16);
f16x8_t vecSrcC, vecSrcD;
/* Number of 8-complex-sample groups, minus one: the last group is
 * handled by the epilogue after the loop rather than inside it. */
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
/* Prologue: preload the first A/B vectors so that every loop iteration
 * can overlap the loads of the next operands with the current
 * multiply-accumulates. */
vecSrcA = vld1q( pSrcA);
vecSrcB = vld1q( pSrcB);
pSrcA += 8;
pSrcB += 8;
/* Each iteration consumes the A/B pair and the C/D pair (two vectors
 * per input) and preloads the next A/B pair.
 * NOTE(review): the vcmlaq / vcmlaq_rot90 pair is presumably the two
 * halves of a full complex multiply-accumulate - confirm against the
 * MVE intrinsics reference. */
while (blkCnt > 0) {
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
/* Epilogue: consume the A/B pair preloaded by the last iteration plus
 * one more C/D pair. The pointers are NOT advanced after these final
 * loads; the tail loop below compensates by advancing before it loads. */
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
/*
* tail
*/
/* Remaining scalar values: the (numSamples & 7) leftover complex samples
 * times CMPLX_DIM (presumably 2 values per complex sample - defined
 * outside this file, confirm). */
blkCnt = CMPLX_DIM * (numSamples & 7);
while (blkCnt > 0) {
/* Predicate enabling only the lanes that still hold valid data. */
mve_pred16_t p = vctp16q(blkCnt);
/* Advance first: skips the vector already consumed by the epilogue
 * (first pass) or by the previous tail pass. */
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
blkCnt -= 8;
}
} else {
/* small vector */
/* Fewer than 16 complex samples: process everything with predicated
 * loads, 8 scalar values per pass. */
blkCnt = numSamples * CMPLX_DIM;
/* Redundant with the initialiser at the declaration of vec_acc. */
vec_acc = vdupq_n_f16(0.0f16);
/* The do/while body runs at least once, even when numSamples is 0.
 * NOTE(review): with blkCnt == 0 the predicate is presumably all-false,
 * making the pass a no-op - confirm. */
do {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
}
/* Sum the partial parts */
/* Reduces the vector accumulator into the scalar real_sum / imag_sum. */
mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum);
/*
* Store the real and imaginary results in the destination buffers
*/
*realResult = real_sum;
*imagResult = imag_sum;
}
#else
/*
 * Scalar implementation of the half-precision complex dot product.
 *
 * Both inputs hold interleaved (real, imag) pairs. For every sample the
 * product (aRe + j*aIm) * (bRe + j*bIm) is added to the running sums:
 *   real += aRe*bRe - aIm*bIm
 *   imag += aRe*bIm + aIm*bRe
 * The individual accumulate statements are kept in the same order for each
 * sample so that floating-point rounding is unchanged.
 */
void arm_cmplx_dot_prod_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        uint32_t numSamples,
        float16_t * realResult,
        float16_t * imagResult)
{
    _Float16 accRe = 0.0f, accIm = 0.0f;   /* running real / imaginary sums */
    _Float16 aRe, aIm, bRe, bIm;           /* current operands */
    uint32_t remaining;                    /* samples left for the plain loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t groups;                       /* groups of 4 complex samples */

    /* Unrolled part: 4 complex samples (8 scalars per input) per pass */
    for (groups = numSamples >> 2U; groups > 0U; groups--)
    {
        /* sample 0 */
        aRe = pSrcA[0];
        aIm = pSrcA[1];
        bRe = pSrcB[0];
        bIm = pSrcB[1];
        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* sample 1 */
        aRe = pSrcA[2];
        aIm = pSrcA[3];
        bRe = pSrcB[2];
        bIm = pSrcB[3];
        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* sample 2 */
        aRe = pSrcA[4];
        aIm = pSrcA[5];
        bRe = pSrcB[4];
        bIm = pSrcB[5];
        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* sample 3 */
        aRe = pSrcA[6];
        aIm = pSrcA[7];
        bRe = pSrcB[6];
        bIm = pSrcB[7];
        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        pSrcA += 8;
        pSrcB += 8;
    }

    /* 0..3 samples are left over for the plain loop */
    remaining = numSamples & 0x3U;
#else
    /* No unrolling: the plain loop handles every sample */
    remaining = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining > 0U; remaining--)
    {
        aRe = pSrcA[0];
        aIm = pSrcA[1];
        bRe = pSrcB[0];
        bIm = pSrcB[1];
        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        pSrcA += 2;
        pSrcB += 2;
    }

    /* Hand the two sums back to the caller */
    *realResult = accRe;
    *imagResult = accIm;
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_dot_prod group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,191 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_f32.c
* Description: Floating-point complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_dot_prod Complex Dot Product
*
* Computes the dot product of two complex vectors.
* The vectors are multiplied element-by-element and then summed.
*
* The <code>pSrcA</code> points to the first complex input vector and
* <code>pSrcB</code> points to the second complex input vector.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
*
* The underlying algorithm is used:
* <pre>
* realResult=0;
* imagResult=0;
* for(n=0; n<numSamples; n++) {
* realResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+0] - pSrcA[(2*n)+1]*pSrcB[(2*n)+1];
* imagResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+1] + pSrcA[(2*n)+1]*pSrcB[(2*n)+0];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_dot_prod
* @{
*/
/**
* @brief Floating-point complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*/
/*
 * Single-precision complex dot product over interleaved (real, imag) data.
 *
 * With ARM_MATH_DSP defined, samples are first consumed four at a time and
 * the 0..3 leftovers fall through to the shared single-sample loop; without
 * it, the single-sample loop handles the whole input. The per-sample
 * accumulate statements keep their original order so rounding is unchanged.
 */
void arm_cmplx_dot_prod_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  uint32_t numSamples,
  float32_t * realResult,
  float32_t * imagResult)
{
  float32_t sumRe = 0.0f, sumIm = 0.0f;   /* running real / imaginary sums */
  float32_t aRe, aIm, bRe, bIm;           /* operands of the current sample */
  uint32_t left;                          /* samples for the single-sample loop */

#if defined (ARM_MATH_DSP)
  uint32_t quads;                         /* number of 4-sample groups */

  /* Unrolled section: four complex samples per pass */
  for (quads = numSamples >> 2U; quads != 0U; quads--)
  {
    aRe = *pSrcA++;
    aIm = *pSrcA++;
    bRe = *pSrcB++;
    bIm = *pSrcB++;
    sumRe += aRe * bRe;
    sumIm += aRe * bIm;
    sumRe -= aIm * bIm;
    sumIm += aIm * bRe;

    aRe = *pSrcA++;
    aIm = *pSrcA++;
    bRe = *pSrcB++;
    bIm = *pSrcB++;
    sumRe += aRe * bRe;
    sumIm += aRe * bIm;
    sumRe -= aIm * bIm;
    sumIm += aIm * bRe;

    aRe = *pSrcA++;
    aIm = *pSrcA++;
    bRe = *pSrcB++;
    bIm = *pSrcB++;
    sumRe += aRe * bRe;
    sumIm += aRe * bIm;
    sumRe -= aIm * bIm;
    sumIm += aIm * bRe;

    aRe = *pSrcA++;
    aIm = *pSrcA++;
    bRe = *pSrcB++;
    bIm = *pSrcB++;
    sumRe += aRe * bRe;
    sumIm += aRe * bIm;
    sumRe -= aIm * bIm;
    sumIm += aIm * bRe;
  }

  /* Whatever is not a multiple of four is handled one sample at a time */
  left = numSamples % 0x4U;
#else
  /* No unrolled section: every sample goes through the loop below */
  left = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */

  for (; left != 0U; left--)
  {
    aRe = *pSrcA++;
    aIm = *pSrcA++;
    bRe = *pSrcB++;
    bIm = *pSrcB++;
    sumRe += aRe * bRe;
    sumIm += aRe * bIm;
    sumRe -= aIm * bIm;
    sumIm += aIm * bRe;
  }

  /* Write both parts of the result to the caller's buffers */
  *realResult = sumRe;
  *imagResult = sumIm;
}
/**
* @} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,177 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_q15.c
* Description: Processing function for the Q15 Complex Dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
*/
/**
* @brief Q15 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
* These are accumulated in a 64-bit accumulator with 34.30 precision.
* As a final step, the accumulators are converted to 8.24 format.
* The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
*/
/*
 * Q15 complex dot product over interleaved (real, imag) data.
 *
 * Every 1.15 x 1.15 product is explicitly widened to q31_t before the
 * multiplication on ALL code paths (the non-DSP path previously relied on
 * the implicit promotion to int), and is accumulated in a 64-bit sum.
 * The real and imaginary contributions are added one product at a time:
 * combining two products in 32 bits first could overflow (e.g. when all
 * four operands are -32768).
 *
 * With ARM_MATH_DSP defined, samples are first consumed four at a time and
 * the 0..3 leftovers fall through to the shared single-sample loop; without
 * it, the single-sample loop handles the whole input.
 *
 * The 64-bit sums are shifted right by 6 and truncated to q31_t on output
 * (34.30 -> 8.24, as described in the function documentation).
 */
void arm_cmplx_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t numSamples,
  q31_t * realResult,
  q31_t * imagResult)
{
  q63_t real_sum = 0, imag_sum = 0;   /* 64-bit accumulators */
  q15_t a0, b0, c0, d0;               /* a0 + j*b0 from A, c0 + j*d0 from B */
  uint32_t blkCnt;                    /* loop counter */

#if defined (ARM_MATH_DSP)
  /* Unrolled section: four complex samples per pass */
  blkCnt = numSamples >> 2U;
  while (blkCnt > 0U)
  {
    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    real_sum += (q31_t)a0 * c0;
    imag_sum += (q31_t)a0 * d0;
    real_sum -= (q31_t)b0 * d0;
    imag_sum += (q31_t)b0 * c0;

    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    real_sum += (q31_t)a0 * c0;
    imag_sum += (q31_t)a0 * d0;
    real_sum -= (q31_t)b0 * d0;
    imag_sum += (q31_t)b0 * c0;

    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    real_sum += (q31_t)a0 * c0;
    imag_sum += (q31_t)a0 * d0;
    real_sum -= (q31_t)b0 * d0;
    imag_sum += (q31_t)b0 * c0;

    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    real_sum += (q31_t)a0 * c0;
    imag_sum += (q31_t)a0 * d0;
    real_sum -= (q31_t)b0 * d0;
    imag_sum += (q31_t)b0 * c0;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* 0..3 samples remain when numSamples is not a multiple of 4 */
  blkCnt = numSamples % 0x4U;
#else
  /* No unrolled section: every sample goes through the loop below */
  blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */

  /* Single-sample loop shared by both configurations */
  while (blkCnt > 0U)
  {
    a0 = *pSrcA++;
    b0 = *pSrcA++;
    c0 = *pSrcB++;
    d0 = *pSrcB++;
    real_sum += (q31_t)a0 * c0;
    imag_sum += (q31_t)a0 * d0;
    real_sum -= (q31_t)b0 * d0;
    imag_sum += (q31_t)b0 * c0;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the real and imaginary results in 8.24 format */
  /* Convert real data in 34.30 to 8.24 by 6 right shifts */
  *realResult = (q31_t) (real_sum >> 6);
  /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
  *imagResult = (q31_t) (imag_sum >> 6);
}
/**
* @} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,175 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_q31.c
* Description: Q31 complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
*/
/**
* @brief Q31 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
* The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
* Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
* The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
* Input down scaling is not required.
*/
/*
 * Q31 complex dot product over interleaved (real, imag) data.
 *
 * Each 1.31 x 1.31 product is formed in 64 bits and shifted right by 14
 * before it is added to (or subtracted from) a 64-bit running sum; the
 * sums are returned unmodified.
 *
 * With ARM_MATH_DSP defined, samples are first consumed four at a time and
 * the 0..3 leftovers fall through to the shared single-sample loop; without
 * it, the single-sample loop handles the whole input.
 */
void arm_cmplx_dot_prod_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  uint32_t numSamples,
  q63_t * realResult,
  q63_t * imagResult)
{
  q63_t accRe = 0, accIm = 0;   /* 64-bit running sums */
  q31_t aRe, aIm, bRe, bIm;     /* operands of the current sample */
  uint32_t left;                /* samples for the single-sample loop */

#if defined (ARM_MATH_DSP)
  uint32_t quads;               /* number of 4-sample groups */

  /* Unrolled section: four complex samples (8 words per input) per pass */
  for (quads = numSamples >> 2U; quads != 0U; quads--)
  {
    /* sample 0 */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    accRe += ((q63_t)aRe * bRe) >> 14;
    accIm += ((q63_t)aRe * bIm) >> 14;
    accRe -= ((q63_t)aIm * bIm) >> 14;
    accIm += ((q63_t)aIm * bRe) >> 14;

    /* sample 1 */
    aRe = pSrcA[2];
    aIm = pSrcA[3];
    bRe = pSrcB[2];
    bIm = pSrcB[3];
    accRe += ((q63_t)aRe * bRe) >> 14;
    accIm += ((q63_t)aRe * bIm) >> 14;
    accRe -= ((q63_t)aIm * bIm) >> 14;
    accIm += ((q63_t)aIm * bRe) >> 14;

    /* sample 2 */
    aRe = pSrcA[4];
    aIm = pSrcA[5];
    bRe = pSrcB[4];
    bIm = pSrcB[5];
    accRe += ((q63_t)aRe * bRe) >> 14;
    accIm += ((q63_t)aRe * bIm) >> 14;
    accRe -= ((q63_t)aIm * bIm) >> 14;
    accIm += ((q63_t)aIm * bRe) >> 14;

    /* sample 3 */
    aRe = pSrcA[6];
    aIm = pSrcA[7];
    bRe = pSrcB[6];
    bIm = pSrcB[7];
    accRe += ((q63_t)aRe * bRe) >> 14;
    accIm += ((q63_t)aRe * bIm) >> 14;
    accRe -= ((q63_t)aIm * bIm) >> 14;
    accIm += ((q63_t)aIm * bRe) >> 14;

    pSrcA += 8;
    pSrcB += 8;
  }

  /* Whatever is not a multiple of four is handled one sample at a time */
  left = numSamples & 0x3U;
#else
  /* No unrolled section: every sample goes through the loop below */
  left = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */

  for (; left != 0U; left--)
  {
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];
    accRe += ((q63_t)aRe * bRe) >> 14;
    accIm += ((q63_t)aRe * bIm) >> 14;
    accRe -= ((q63_t)aIm * bIm) >> 14;
    accIm += ((q63_t)aIm * bRe) >> 14;

    pSrcA += 2;
    pSrcB += 2;
  }

  /* Hand back both 64-bit sums as they are */
  *realResult = accRe;
  *imagResult = accIm;
}
/**
* @} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,241 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f16.c
* Description: Floating-point complex magnitude
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/*
 * MVE (Helium) implementation of the half-precision complex magnitude.
 *
 * For each complex sample the squared magnitude re*re + im*im is formed and
 * its square root is obtained as x * invSqrt(x), where invSqrt(x) comes from
 * a magic-constant initial guess refined by the INVSQRT_NEWTON_MVE_F16 macro
 * (defined outside this file; per the comments below, a Newton-Raphson step).
 *
 * The main loop handles 8 complex samples per pass; a single predicated pass
 * handles the remaining 1..7 samples.
 */
void arm_cmplx_mag_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f16x8x2_t vecSrc;
f16x8_t sum;
/* Compute 8 complex samples at a time (16 input values in, 8 results out) */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
/* NOTE(review): vld2q presumably de-interleaves the data so that val[0]
 * holds the real parts and val[1] the imaginary parts - confirm against
 * the MVE intrinsics reference. */
vecSrc = vld2q(pSrc);
pSrc += 16;
/* sum = val[0]^2 + val[1]^2 (squared magnitude per lane) */
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
/* Initial guess: magic constant minus half of the value's bit pattern
 * reinterpreted as a 16-bit integer. */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = sum * 0.5f;
/*
* compute 3 x iterations
*
* The more iterations, the more accuracy.
* If you need to trade a bit of accuracy for more performance,
* you can comment out the 3rd use of the macro.
*/
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
/* Lanes where invSqrt compares below zero are overwritten with 0. */
invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vstrhq_f16(pDst, sum);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
/* 1..7 complex samples may remain after the main loop. */
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
/* Predicate selecting the first blkCnt lanes; used for the store only. */
mve_pred16_t p0 = vctp16q(blkCnt);
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
/* NOTE(review): this load is not predicated, so it fetches a full group
 * of 16 values even though fewer remain; it may read past the end of the
 * input buffer - confirm this is acceptable for callers. */
vecSrc = vld2q((float16_t const *)pSrc);
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = vmulq(sum, (float16_t)0.5);
/*
* compute 2 x iterations
*/
/* NOTE(review): only two refinement steps are applied here versus three
 * in the main loop, so tail results may be slightly less accurate. */
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
/* Predicated store: only the valid lanes are written to pDst. */
vstrhq_p_f16(pDst, sum, p0);
}
}
#else
/*
 * Scalar implementation of the half-precision complex magnitude.
 *
 * Reads interleaved (real, imag) pairs from pSrc and, for each pair, lets
 * arm_sqrt_f16() write sqrt(real^2 + imag^2) to the next slot of pDst.
 * The status returned by arm_sqrt_f16() is not inspected.
 */
void arm_cmplx_mag_f16(
  const float16_t * pSrc,
        float16_t * pDst,
        uint32_t numSamples)
{
    _Float16 re, im;        /* parts of the current input sample */
    uint32_t remaining;     /* samples left for the plain loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
    uint32_t groups;        /* groups of 4 complex samples */

    /* Unrolled part: 4 magnitudes per pass (8 inputs in, 4 outputs out) */
    for (groups = numSamples >> 2U; groups > 0U; groups--)
    {
        re = pSrc[0];
        im = pSrc[1];
        arm_sqrt_f16((re * re) + (im * im), &pDst[0]);

        re = pSrc[2];
        im = pSrc[3];
        arm_sqrt_f16((re * re) + (im * im), &pDst[1]);

        re = pSrc[4];
        im = pSrc[5];
        arm_sqrt_f16((re * re) + (im * im), &pDst[2]);

        re = pSrc[6];
        im = pSrc[7];
        arm_sqrt_f16((re * re) + (im * im), &pDst[3]);

        pSrc += 8;
        pDst += 4;
    }

    /* 0..3 samples are left over for the plain loop */
    remaining = numSamples & 0x3U;
#else
    /* No unrolling: the plain loop handles every sample */
    remaining = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; remaining > 0U; remaining--)
    {
        /* out = sqrt(re * re + im * im) */
        re = pSrc[0];
        im = pSrc[1];
        arm_sqrt_f16((re * re) + (im * im), pDst);

        pSrc += 2;
        pDst += 1;
    }
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,153 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f32.c
* Description: Floating-point complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_math.h"
/**
* @ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_mag Complex Magnitude
*
* Computes the magnitude of the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* in the input array and the data is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The input array has a total of <code>2*numSamples</code> values;
* the output array has a total of <code>numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_mag
* @{
*/
/**
* @brief Floating-point complex magnitude.
* @param[in] *pSrc points to complex input buffer
* @param[out] *pDst points to real output buffer
* @param[in] numSamples number of complex samples in the input vector
* @return none.
*
*/
/*
 * Single-precision complex magnitude.
 *
 * Reads interleaved (real, imag) pairs from pSrc and, for each pair, lets
 * arm_sqrt_f32() write sqrt(real^2 + imag^2) to the next slot of pDst.
 * The status returned by arm_sqrt_f32() is not inspected.
 *
 * With ARM_MATH_DSP defined, samples are first consumed four at a time and
 * the 0..3 leftovers fall through to the shared single-sample loop; without
 * it, the single-sample loop handles the whole input.
 */
void arm_cmplx_mag_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t numSamples)
{
  float32_t re, im;     /* parts of the current input sample */
  uint32_t left;        /* samples for the single-sample loop */

#if defined (ARM_MATH_DSP)
  uint32_t quads;       /* number of 4-sample groups */

  /* Unrolled section: four magnitudes per pass */
  for (quads = numSamples >> 2U; quads != 0U; quads--)
  {
    re = pSrc[0];
    im = pSrc[1];
    arm_sqrt_f32((re * re) + (im * im), &pDst[0]);

    re = pSrc[2];
    im = pSrc[3];
    arm_sqrt_f32((re * re) + (im * im), &pDst[1]);

    re = pSrc[4];
    im = pSrc[5];
    arm_sqrt_f32((re * re) + (im * im), &pDst[2]);

    re = pSrc[6];
    im = pSrc[7];
    arm_sqrt_f32((re * re) + (im * im), &pDst[3]);

    pSrc += 8;
    pDst += 4;
  }

  /* Whatever is not a multiple of four is handled one sample at a time */
  left = numSamples & 0x3U;
#else
  /* No unrolled section: every sample goes through the loop below */
  left = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */

  for (; left != 0U; left--)
  {
    /* out = sqrt((real * real) + (imag * imag)) */
    re = pSrc[0];
    im = pSrc[1];
    arm_sqrt_f32((re * re) + (im * im), pDst);

    pSrc += 2;
    pDst += 1;
  }
}
/**
* @} end of cmplx_mag group
*/

View File

@@ -0,0 +1,100 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f64.c
* Description: Floating-point complex magnitude
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
/*
 * Double-precision complex magnitude.
 *
 * Walks the interleaved (real, imag) input one complex sample at a time and
 * stores sqrt(real^2 + imag^2) for each sample into consecutive slots of
 * pDst. No loop unrolling is used.
 */
void arm_cmplx_mag_f64(
  const float64_t * pSrc,
        float64_t * pDst,
        uint32_t numSamples)
{
    float64_t re, im;       /* parts of the current input sample */
    uint32_t remaining;     /* complex samples still to process */

    for (remaining = numSamples; remaining != 0U; remaining--)
    {
        /* Fetch one complex sample: real part first, then imaginary */
        re = pSrc[0];
        im = pSrc[1];
        pSrc += 2;

        /* out = sqrt(re * re + im * im) */
        *pDst = sqrt((re * re) + (im * im));
        pDst += 1;
    }
}
/**
@} end of cmplx_mag group
*/

Some files were not shown because too many files have changed in this diff Show More