инициализация spi перенесена на cube (работает)

убрана кастомная библиотека periph_general
This commit is contained in:
2025-06-27 15:28:58 +03:00
parent 981dbf9bfa
commit a9a2466359
1587 changed files with 403396 additions and 250770 deletions

View File

@@ -0,0 +1,74 @@
cmake_minimum_required (VERSION 3.14)
project(CMSISDSPComplexMath)
include(configLib)
include(configDsp)
add_library(CMSISDSPComplexMath STATIC)
configLib(CMSISDSPComplexMath ${ROOT})
configDsp(CMSISDSPComplexMath ${ROOT})
include(interpol)
interpol(CMSISDSPComplexMath)
if (CONFIGTABLE AND ALLFAST)
target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
endif()
# Vectorized code is defining sqrt
# so fast tables required even if Fast Math not built.
if (CONFIGTABLE AND (HELIUM OR MVEF OR MVEI))
target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_FAST_ALLOW_TABLES)
endif()
# MVE code is using a table for computing the fast sqrt arm_cmplx_mag_q31
# There is the possibility of not compiling this function and not including
# the table.
if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
endif()
if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_FAST_Q15 OR (NOT HELIUM AND NOT MVEI))
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_fast_q15.c)
endif()
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q15.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q31.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q15.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q31.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f64.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f64.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q15.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q31.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f64.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q15.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q31.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_f32.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q15.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q31.c)
if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_f16.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_f16.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f16.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f16.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f16.c)
target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_f16.c)
endif()
### Includes
target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")

View File

@@ -0,0 +1,66 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CompexMathFunctions.c
* Description: Combination of all comlex math function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f32.c"
#include "arm_cmplx_conj_q15.c"
#include "arm_cmplx_conj_q31.c"
#include "arm_cmplx_dot_prod_f32.c"
#include "arm_cmplx_dot_prod_q15.c"
#include "arm_cmplx_dot_prod_q31.c"
#include "arm_cmplx_mag_f32.c"
#include "arm_cmplx_mag_f64.c"
#if (defined (ARM_MATH_HELIUM) || defined(ARM_MATH_MVEI)) && !defined(ARM_MATH_AUTOVECTORIZE)
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
#include "arm_cmplx_mag_fast_q15.c"
#endif
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
#include "arm_cmplx_mag_q31.c"
#endif
#else
#include "arm_cmplx_mag_q15.c"
#include "arm_cmplx_mag_fast_q15.c"
#include "arm_cmplx_mag_q31.c"
#endif
#include "arm_cmplx_mag_squared_f32.c"
#include "arm_cmplx_mag_squared_f64.c"
#include "arm_cmplx_mag_squared_q15.c"
#include "arm_cmplx_mag_squared_q31.c"
#include "arm_cmplx_mult_cmplx_f32.c"
#include "arm_cmplx_mult_cmplx_f64.c"
#include "arm_cmplx_mult_cmplx_q15.c"
#include "arm_cmplx_mult_cmplx_q31.c"
#include "arm_cmplx_mult_real_f32.c"
#include "arm_cmplx_mult_real_q15.c"
#include "arm_cmplx_mult_real_q31.c"

View File

@@ -0,0 +1,32 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: CompexMathFunctionsF16.c
* Description: Combination of all complex math function f16 source files.
*
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_cmplx_conj_f16.c"
#include "arm_cmplx_dot_prod_f16.c"
#include "arm_cmplx_mag_f16.c"
#include "arm_cmplx_mag_squared_f16.c"
#include "arm_cmplx_mult_cmplx_f16.c"
#include "arm_cmplx_mult_real_f16.c"

View File

@@ -0,0 +1,185 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_conj_f16.c
* Description: Floating-point complex conjugate
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_conj
@{
*/
/**
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
static const float16_t cmplx_conj_sign[8] = { 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f };
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f16x8_t vecSrc;
f16x8_t vecSign;
/*
* load sign vector
*/
vecSign = *(f16x8_t *) cmplx_conj_sign;
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst,vmulq(vecSrc, vecSign));
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
blkCnt--;
}
/* Tail */
blkCnt = (blockSize & 0x7) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -(_Float16)*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_conj group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_f32.c
* Description: Floating-point complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,146 +26,188 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_conj Complex Conjugate
*
* Conjugates the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[(2*n)+0)] = pSrc[(2*n)+0]; // real part
* pDst[(2*n)+1)] = -pSrc[(2*n)+1]; // imag part
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_conj Complex Conjugate
Conjugates the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the destination data where the result should be written.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n) ] = pSrc[(2*n) ]; // real part
pDst[(2*n)+1] = -pSrc[(2*n)+1]; // imag part
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Floating-point complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
@brief Floating-point complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f32x4_t vecSrc;
f32x4_t vecSign;
#if defined (ARM_MATH_DSP)
/*
* load sign vector
*/
vecSign = *(f32x4_t *) cmplx_conj_sign;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inR1, inR2, inR3, inR4;
float32_t inI1, inI2, inI3, inI4;
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 2U;
/*loop Unrolling */
while (blkCnt > 0U)
{
vecSrc = vld1q(pSrc);
vst1q(pDst,vmulq(vecSrc, vecSign));
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 4;
pDst += 4;
blkCnt--;
}
/* Tail */
blkCnt = (blockSize & 0x3) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4_t zero;
float32x4x2_t vec;
zero = vdupq_n_f32(0.0f);
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+(-1)*jA[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
vec = vld2q_f32(pSrc);
vec.val[1] = vsubq_f32(zero,vec.val[1]);
vst2q_f32(pDst,vec);
/* Increment pointers */
pSrc += 8;
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 0x3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* read real input samples */
inR1 = pSrc[0];
/* store real samples to destination */
pDst[0] = inR1;
inR2 = pSrc[2];
pDst[2] = inR2;
inR3 = pSrc[4];
pDst[4] = inR3;
inR4 = pSrc[6];
pDst[6] = inR4;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* read imaginary input samples */
inI1 = pSrc[1];
inI2 = pSrc[3];
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* conjugate input */
inI1 = -inI1;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* read imaginary input samples */
inI3 = pSrc[5];
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* conjugate input */
inI2 = -inI2;
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* read imaginary input samples */
inI4 = pSrc[7];
/* conjugate input */
inI3 = -inI3;
/* store imaginary samples to destination */
pDst[1] = inI1;
pDst[3] = inI2;
/* conjugate input */
inI4 = -inI4;
/* store imaginary samples to destination */
pDst[5] = inI3;
/* increment source pointer by 8 to process next sampels */
pSrc += 8U;
/* store imaginary sample to destination */
pDst[7] = inI4;
/* increment destination pointer by 8 to store next samples */
pDst += 8U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined (ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* realOut + j (imagOut) = realIn + j (-1) imagIn */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = -*pSrc++;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q15.c
* Description: Q15 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,71 +26,118 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Q15 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
@brief Q15 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q31_t in1;
#if defined (ARM_MATH_DSP)
q15x8x2_t vecSrc;
q15x8_t zero;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t zero = 0;
zero = vdupq_n_s16(0);
/*loop Unrolling */
/* Compute 8 real samples at a time */
blkCnt = blockSize >> 4U;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vecSrc.val[1] = vqsubq(zero, vecSrc.val[1]);
vst2q(pDst,vecSrc);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 16;
pDst += 16;
blkCnt --;
}
/* Tail */
blkCnt = (blockSize & 0xF) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = __SSAT(-in1, 16);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in1; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t in2, in3, in4; /* Temporary input variables */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
in1 = read_q15x2_ia (&pSrc);
in2 = read_q15x2_ia (&pSrc);
in3 = read_q15x2_ia (&pSrc);
in4 = read_q15x2_ia (&pSrc);
#ifndef ARM_MATH_BIG_ENDIAN
in1 = __QASX(zero, in1);
in2 = __QASX(zero, in2);
in3 = __QASX(zero, in3);
in4 = __QASX(zero, in4);
in1 = __QASX(0, in1);
in2 = __QASX(0, in2);
in3 = __QASX(0, in3);
in4 = __QASX(0, in4);
#else
in1 = __QSAX(zero, in1);
in2 = __QSAX(zero, in2);
in3 = __QSAX(zero, in3);
in4 = __QSAX(zero, in4);
in1 = __QSAX(0, in1);
in2 = __QSAX(0, in2);
in3 = __QSAX(0, in3);
in4 = __QSAX(0, in4);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
in1 = ((uint32_t) in1 >> 16) | ((uint32_t) in1 << 16);
@@ -98,52 +145,63 @@ void arm_cmplx_conj_q15(
in3 = ((uint32_t) in3 >> 16) | ((uint32_t) in3 << 16);
in4 = ((uint32_t) in4 >> 16) | ((uint32_t) in4 << 16);
*__SIMD32(pDst)++ = in1;
*__SIMD32(pDst)++ = in2;
*__SIMD32(pDst)++ = in3;
*__SIMD32(pDst)++ = in4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
*pDst++ = __SSAT(-*pSrc++, 16);
/* Decrement the loop counter */
blkCnt--;
}
write_q15x2_ia (&pDst, in1);
write_q15x2_ia (&pDst, in2);
write_q15x2_ia (&pDst, in3);
write_q15x2_ia (&pDst, in4);
#else
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
q15_t in;
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
/* Run the below code for Cortex-M0 */
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
while (numSamples > 0U)
{
/* realOut + j (imagOut) = realIn+ j (-1) imagIn */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = (in == (q15_t) 0x8000) ? 0x7fff : -in;
/* Decrement the loop counter */
numSamples--;
}
*pDst++ = *pSrc++;
in1 = *pSrc++;
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in1 = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __SSAT(-in1, 16);
#else
*pDst++ = (in1 == (q15_t) 0x8000) ? (q15_t) 0x7fff : -in1;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_conj_q31.c
* Description: Q31 complex conjugate
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,144 +26,168 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_conj
* @{
@addtogroup cmplx_conj
@{
*/
/**
* @brief Q31 complex conjugate.
* @param *pSrc points to the input vector
* @param *pDst points to the output vector
* @param numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
@brief Q31 complex conjugate.
@param[in] pSrc points to the input vector
@param[out] pDst points to the output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
q31_t in; /* Input value */
#if defined (ARM_MATH_DSP)
uint32_t blockSize = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q31x4x2_t vecSrc;
q31_t in; /* Temporary input variable */
q31x4_t zero;
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t inR1, inR2, inR3, inR4; /* Temporary real variables */
q31_t inI1, inI2, inI3, inI4; /* Temporary imaginary variables */
zero = vdupq_n_s32(0);
/*loop Unrolling */
/* Compute 4 real samples at a time */
blkCnt = blockSize >> 3U;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vecSrc.val[1] = vqsubq(zero, vecSrc.val[1]);
vst2q(pDst,vecSrc);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrc += 8;
pDst += 8;
blkCnt --;
}
/* Tail */
blkCnt = (blockSize & 0x7) >> 1;
while (blkCnt > 0U)
{
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = __QSUB(0, in);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_conj_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary input variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* Saturated to 0x7fffffff if the input is -1(0x80000000) */
/* read real input sample */
inR1 = pSrc[0];
/* store real input sample */
pDst[0] = inR1;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* read imaginary input sample */
inI1 = pSrc[1];
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read real input sample */
inR2 = pSrc[2];
/* store real input sample */
pDst[2] = inR2;
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read imaginary input sample */
inI2 = pSrc[3];
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* negate imaginary input sample */
inI1 = __QSUB(0, inI1);
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* read real input sample */
inR3 = pSrc[4];
/* store real input sample */
pDst[4] = inR3;
/* read imaginary input sample */
inI3 = pSrc[5];
/* negate imaginary input sample */
inI2 = __QSUB(0, inI2);
/* read real input sample */
inR4 = pSrc[6];
/* store real input sample */
pDst[6] = inR4;
/* negate imaginary input sample */
inI3 = __QSUB(0, inI3);
/* store imaginary input sample */
inI4 = pSrc[7];
/* store imaginary input samples */
pDst[1] = inI1;
/* negate imaginary input sample */
inI4 = __QSUB(0, inI4);
/* store imaginary input samples */
pDst[3] = inI2;
/* increment source pointer by 8 to proecess next samples */
pSrc += 8U;
/* store imaginary input samples */
pDst[5] = inI3;
pDst[7] = inI4;
/* increment destination pointer by 8 to process next samples */
pDst += 8U;
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0]+jC[1] = A[0]+ j (-1) A[1] */
/* Calculate Complex Conjugate and then store the results in the destination buffer. */
/* Saturated to 0x7fffffff if the input is -1(0x80000000) */
*pDst++ = *pSrc++;
in = *pSrc++;
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
/* C[0] + jC[1] = A[0]+ j(-1)A[1] */
/* Decrement the loop counter */
/* Calculate Complex Conjugate and store result in destination buffer. */
*pDst++ = *pSrc++;
in = *pSrc++;
#if defined (ARM_MATH_DSP)
*pDst++ = __QSUB(0, in);
#else
*pDst++ = (in == INT32_MIN) ? INT32_MAX : -in;
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_conj group
@} end of cmplx_conj group
*/

View File

@@ -0,0 +1,288 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_dot_prod_f16.c
* Description: Floating-point complex dot product
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_dot_prod
@{
*/
/**
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t numSamples,
float16_t * realResult,
float16_t * imagResult)
{
int32_t blkCnt;
float16_t real_sum, imag_sum;
f16x8_t vecSrcA, vecSrcB;
f16x8_t vec_acc = vdupq_n_f16(0.0f16);
f16x8_t vecSrcC, vecSrcD;
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q( pSrcA);
vecSrcB = vld1q( pSrcB);
pSrcA += 8;
pSrcB += 8;
while (blkCnt > 0) {
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
blkCnt -= 8;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
vec_acc = vdupq_n_f16(0.0f16);
do {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
}
/* Sum the partial parts */
mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum);
/*
* Store the real and imaginary results in the destination buffers
*/
*realResult = real_sum;
*imagResult = imag_sum;
}
#else
void arm_cmplx_dot_prod_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
uint32_t numSamples,
float16_t * realResult,
float16_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
_Float16 real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
_Float16 a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement loop counter */
blkCnt--;
}
/* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_dot_prod group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_f32.c
* Description: Floating-point complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,166 +26,315 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_dot_prod Complex Dot Product
*
* Computes the dot product of two complex vectors.
* The vectors are multiplied element-by-element and then summed.
*
* The <code>pSrcA</code> points to the first complex input vector and
* <code>pSrcB</code> points to the second complex input vector.
* <code>numSamples</code> specifies the number of complex samples
* and the data in each array is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* Each array has a total of <code>2*numSamples</code> values.
*
* The underlying algorithm is used:
* <pre>
* realResult=0;
* imagResult=0;
* for(n=0; n<numSamples; n++) {
* realResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+0] - pSrcA[(2*n)+1]*pSrcB[(2*n)+1];
* imagResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+1] + pSrcA[(2*n)+1]*pSrcB[(2*n)+0];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_dot_prod Complex Dot Product
Computes the dot product of two complex vectors.
The vectors are multiplied element-by-element and then summed.
The <code>pSrcA</code> points to the first complex input vector and
<code>pSrcB</code> points to the second complex input vector.
<code>numSamples</code> specifies the number of complex samples
and the data in each array is stored in an interleaved fashion
(real, imag, real, imag, ...).
Each array has a total of <code>2*numSamples</code> values.
The underlying algorithm is used:
<pre>
realResult = 0;
imagResult = 0;
for (n = 0; n < numSamples; n++) {
realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Floating-point complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
@brief Floating-point complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_f32(
float32_t * pSrcA,
float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
{
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result storage */
float32_t a0,b0,c0,d0;
int32_t blkCnt;
float32_t real_sum, imag_sum;
f32x4_t vecSrcA, vecSrcB;
f32x4_t vec_acc = vdupq_n_f32(0.0f);
f32x4_t vecSrcC, vecSrcD;
#if defined (ARM_MATH_DSP)
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
while (blkCnt > 0) {
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
/*loop Unrolling */
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
blkCnt -= 4;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
vec_acc = vdupq_n_f32(0.0f);
do {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
}
real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);
/*
* Store the real and imaginary results in the destination buffers
*/
*realResult = real_sum;
*imagResult = imag_sum;
}
#else
void arm_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
uint32_t numSamples,
float32_t * realResult,
float32_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
float32_t a0,b0,c0,d0;
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vec1,vec2,vec3,vec4;
float32x4_t accR,accI;
float32x2_t accum = vdup_n_f32(0);
accR = vdupq_n_f32(0.0f);
accI = vdupq_n_f32(0.0f);
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3U;
while (blkCnt > 0U)
{
/* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
/* Calculate dot product and then store the result in a temporary buffer. */
vec1 = vld2q_f32(pSrcA);
vec2 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
vec3 = vld2q_f32(pSrcA);
vec4 = vld2q_f32(pSrcB);
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
/* Decrement the loop counter */
blkCnt--;
}
accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* Tail */
blkCnt = numSamples & 0x7;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples & 0x3U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in the destination buffers */
/* Store real and imaginary result in destination buffer. */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q15.c
* Description: Processing function for the Q15 Complex Dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,152 +26,231 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Q15 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
* These are accumulated in a 64-bit accumulator with 34.30 precision.
* As a final step, the accumulators are converted to 8.24 format.
* The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
@brief Q15 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned her
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
These are accumulated in a 64-bit accumulator with 34.30 precision.
As a final step, the accumulators are converted to 8.24 format.
The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_q15(
q15_t * pSrcA,
q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
{
q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
q15_t a0,b0,c0,d0;
int32_t blkCnt;
q63_t accReal = 0LL;
q63_t accImag = 0LL;
q15x8_t vecSrcA, vecSrcB;
q15x8_t vecSrcC, vecSrcD;
#if defined (ARM_MATH_DSP)
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 8;
pSrcB += 8;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
while (blkCnt > 0) {
accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
/*loop Unrolling */
accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
do {
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
blkCnt -= 8;
}
while ((int32_t) blkCnt > 0);
} else {
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
}
*realResult = asrl(accReal, (14 - 8));
*imagResult = asrl(accImag, (14 - 8));
}
#else
void arm_cmplx_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
uint32_t numSamples,
q31_t * realResult,
q31_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q15_t a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += a0 * c0;
imag_sum += a0 * d0;
real_sum -= b0 * d0;
imag_sum += b0 * c0;
real_sum += (q31_t)a0 * c0;
imag_sum += (q31_t)a0 * d0;
real_sum -= (q31_t)b0 * d0;
imag_sum += (q31_t)b0 * c0;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in 8.24 format */
/* Store real and imaginary result in 8.24 format */
/* Convert real data in 34.30 to 8.24 by 6 right shifts */
*realResult = (q31_t) (real_sum >> 6);
/* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
*imagResult = (q31_t) (imag_sum >> 6);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_dot_prod_q31.c
* Description: Q31 complex dot product
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,150 +26,234 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_dot_prod
* @{
@addtogroup cmplx_dot_prod
@{
*/
/**
* @brief Q31 complex dot product
* @param *pSrcA points to the first input vector
* @param *pSrcB points to the second input vector
* @param numSamples number of complex samples in each vector
* @param *realResult real part of the result returned here
* @param *imagResult imaginary part of the result returned here
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function is implemented using an internal 64-bit accumulator.
* The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
* The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
* Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
* The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
* Input down scaling is not required.
@brief Q31 complex dot product.
@param[in] pSrcA points to the first input vector
@param[in] pSrcB points to the second input vector
@param[in] numSamples number of samples in each vector
@param[out] realResult real part of the result returned here
@param[out] imagResult imaginary part of the result returned here
@return none
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
Input down scaling is not required.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_dot_prod_q31(
q31_t * pSrcA,
q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
{
q63_t real_sum = 0, imag_sum = 0; /* Temporary result storage */
q31_t a0,b0,c0,d0;
int32_t blkCnt;
q63_t accReal = 0LL;
q63_t accImag = 0LL;
q31x4_t vecSrcA, vecSrcB;
q31x4_t vecSrcC, vecSrcD;
#if defined (ARM_MATH_DSP)
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
while (blkCnt > 0) {
accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
/*loop Unrolling */
accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
do {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
blkCnt -= 4;
}
while ((int32_t) blkCnt > 0);
} else {
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
}
*realResult = asrl(accReal, (14 - 8));
*imagResult = asrl(accImag, (14 - 8));
}
#else
void arm_cmplx_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
uint32_t numSamples,
q63_t * realResult,
q63_t * imagResult)
{
uint32_t blkCnt; /* Loop counter */
q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
q31_t a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
blkCnt--;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
a0 = *pSrcA++;
b0 = *pSrcA++;
c0 = *pSrcB++;
d0 = *pSrcB++;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
real_sum += ((q63_t)a0 * c0) >> 14;
imag_sum += ((q63_t)a0 * d0) >> 14;
real_sum -= ((q63_t)b0 * d0) >> 14;
imag_sum += ((q63_t)b0 * c0) >> 14;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Store the real and imaginary results in 16.48 format */
/* Store real and imaginary result in 16.48 format */
*realResult = real_sum;
*imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_dot_prod group
@} end of cmplx_dot_prod group
*/

View File

@@ -0,0 +1,241 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f16.c
* Description: Floating-point complex magnitude
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f16x8x2_t vecSrc;
f16x8_t sum;
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
vecSrc = vld2q(pSrc);
pSrc += 16;
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = sum * 0.5f;
/*
* compute 3 x iterations
*
* The more iterations, the more accuracy.
* If you need to trade a bit of accuracy for more performance,
* you can comment out the 3rd use of the macro.
*/
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vstrhq_f16(pDst, sum);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
q15x8_t newtonStartVec;
f16x8_t sumHalf, invSqrt;
vecSrc = vld2q((float16_t const *)pSrc);
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
sumHalf = vmulq(sum, (float16_t)0.5);
/*
* compute 2 x iterations
*/
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vstrhq_p_f16(pDst, sum, p0);
}
}
#else
void arm_cmplx_mag_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
_Float16 real, imag; /* Temporary variables to hold input values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f16((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_f32.c
* Description: Floating-point complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,128 +26,248 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_mag Complex Magnitude
*
* Computes the magnitude of the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* in the input array and the data is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The input array has a total of <code>2*numSamples</code> values;
* the output array has a total of <code>numSamples</code> values.
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_mag
* @{
@addtogroup cmplx_mag
@{
*/
/**
* @brief Floating-point complex magnitude.
* @param[in] *pSrc points to complex input buffer
* @param[out] *pDst points to real output buffer
* @param[in] numSamples number of complex samples in the input vector
* @return none.
*
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_vec_math.h"
#endif
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
float32_t realIn, imagIn; /* Temporary variables to hold input values */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f32x4x2_t vecSrc;
f32x4_t sum;
float32_t real, imag; /* Temporary variables to hold input values */
#if defined (ARM_MATH_DSP)
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
q31x4_t newtonStartVec;
f32x4_t sumHalf, invSqrt;
/* Run the below code for Cortex-M4 and Cortex-M3 */
vecSrc = vld2q(pSrc);
pSrc += 8;
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
/*
* inlined Fast SQRT using inverse SQRT newton-raphson method
*/
/* compute initial value */
newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
sumHalf = sum * 0.5f;
/*
* compute 3 x iterations
*
* The more iterations, the more accuracy.
* If you need to trade a bit of accuracy for more performance,
* you can comment out the 3rd use of the macro.
*/
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
/*
* set negative values to 0
*/
invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
/*
* sqrt(x) = x * invSqrt(x)
*/
sum = vmulq(sum, invSqrt);
vst1q(pDst, sum);
pDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
float32_t real, imag; /* Temporary variables to hold input values */
/*loop Unrolling */
blkCnt = numSamples >> 2U;
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
float32x4_t vMagSqA;
float32x4x2_t vecB;
float32x4_t vRealB;
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* out = sqrt((real * real) + (imag * imag)) */
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vecA = vld2q_f32(pSrc);
pSrc += 8;
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vecB = vld2q_f32(pSrc);
pSrc += 8;
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
vMagSqA = vaddq_f32(vRealA, vImagA);
realIn = *pSrc++;
imagIn = *pSrc++;
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
vMagSqB = vaddq_f32(vRealB, vImagB);
/* Store the result in the destination buffer. */
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
pDst += 4;
vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
/* Decrement the loop counter */
blkCnt--;
}
blkCnt = numSamples & 7;
#else
/* Run the below code for Cortex-M0 */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
while (numSamples > 0U)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* out = sqrt((real * real) + (imag * imag)) */
realIn = *pSrc++;
imagIn = *pSrc++;
/* store the result in the destination buffer. */
arm_sqrt_f32((realIn * realIn) + (imagIn * imagIn), pDst++);
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
/* Decrement the loop counter */
numSamples--;
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
real = *pSrc++;
imag = *pSrc++;
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
arm_sqrt_f32((real * real) + (imag * imag), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View File

@@ -0,0 +1,100 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_f64.c
* Description: Floating-point complex magnitude
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag Complex Magnitude
Computes the magnitude of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Floating-point complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mag_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* loop counter */
float64_t real, imag; /* Temporary variables to hold input values */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
*pDst++ = sqrt((real * real) + (imag * imag));
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of cmplx_mag group
*/

View File

@@ -0,0 +1,223 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_fast_q15.c
* Description: Q15 complex magnitude
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_mag
@{
*/
/**
@brief Q15 complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
Fast functions are less accurate. This function will tend to clamp to 0
the too small values. So sqrt(x*x) = x will not always be true.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_fast_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
q15x8x2_t vecSrc;
q15x8_t sum;
q31_t in;
q31_t acc0;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
pSrc += 16;
sum = vqaddq(vmulhq(vecSrc.val[0], vecSrc.val[0]),
vmulhq(vecSrc.val[1], vecSrc.val[1]));
sum = vshrq(sum, 1);
sum = FAST_VSQRT_Q15(sum);
vst1q(pDst, sum);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_fast_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
q31_t in;
q31_t acc0; /* Accumulators */
#else
q15_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_mag group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q15.c
* Description: Q15 complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,116 +26,243 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag
* @{
@addtogroup cmplx_mag
@{
*/
/**
* @brief Q15 complex magnitude
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
@brief Q15 complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
*/
/* Sqrt q31 is used otherwise accuracy is not good enough
for small values and for some applications it is
an issue.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
q31_t acc0, acc1; /* Accumulators */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
q15x8x2_t vecSrc;
q31x4_t prod0;
q31x4_t prod1;
q31_t in;
q31_t acc0;
q31x4_t acc0V;
q31x4_t acc1V;
q31_t res;
q15x8_t resV;
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
pSrc += 16;
acc0V = vdupq_n_s32(0);
acc1V = vdupq_n_s32(0);
prod0 = vmullbq_int_s16(vecSrc.val[0], vecSrc.val[0]);
acc0V = vqaddq_s32(acc0V,prod0);
prod0 = vmullbq_int_s16(vecSrc.val[1], vecSrc.val[1]);
acc0V = vqaddq_s32(acc0V,prod0);
prod1 = vmulltq_int_s16(vecSrc.val[0], vecSrc.val[0]);
acc1V = vqaddq_s32(acc1V,prod1);
prod1 = vmulltq_int_s16(vecSrc.val[1], vecSrc.val[1]);
acc1V = vqaddq_s32(acc1V,prod1);
acc0V = vshrq(acc0V, 1);
acc1V = vshrq(acc1V, 1);
acc0V = FAST_VSQRT_Q31(acc0V);
acc1V = FAST_VSQRT_Q31(acc1V);
resV = vdupq_n_s16(0);
resV = vqshrnbq_n_s32(resV,acc0V,16);
resV = vqshrntq_n_s32(resV,acc1V,16);
vst1q(pDst, resV);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
q31_t res; /* temporary result */
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
q31_t in;
q31_t acc0; /* Accumulators */
#else
q15_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#endif
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t acc2, acc3;
#if defined (ARM_MATH_LOOPUNROLL)
/*loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
acc1 = __SMUAD(in2, in2);
acc2 = __SMUAD(in3, in3);
acc3 = __SMUAD(in4, in4);
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
/* store the result in 2.14 format in the destination buffer. */
arm_sqrt_q15((q15_t) ((acc0) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc1) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc2) >> 17), pDst++);
arm_sqrt_q15((q15_t) ((acc3) >> 17), pDst++);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
/* Decrement the loop counter */
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q31((acc0 + acc1) >> 1 , &res);
*pDst++ = res >> 16;
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q31((acc0 + acc1) >> 1 , &res);
*pDst++ = res >> 16;
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q31((acc0 + acc1) >> 1 , &res);
*pDst++ = res >> 16;
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
arm_sqrt_q31((acc0 + acc1) >> 1 , &res);
*pDst++ = res >> 16;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
/* store the result in 2.14 format in the destination buffer. */
arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
q15_t real, imag; /* Temporary variables to hold input values */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* out = sqrt(real * real + imag * imag) */
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q31(acc0 >> 1 , &res);
*pDst++ = res >> 16;
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
acc0 = (real * real);
acc1 = (imag * imag);
/* store result in 2.14 format in destination buffer. */
arm_sqrt_q31((acc0 + acc1) >> 1 , &res);
*pDst++ = res >> 16;
#endif
/* store the result in 2.14 format in the destination buffer. */
arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_q31.c
* Description: Q31 complex magnitude
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,148 +26,176 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag
* @{
@addtogroup cmplx_mag
@{
*/
/**
* @brief Q31 complex magnitude
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
* Input down scaling is not required.
@brief Q31 complex magnitude.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
Input down scaling is not required.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
void arm_cmplx_mag_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
q31_t real, imag; /* Temporary variables to hold input values */
q31_t acc0, acc1; /* Accumulators */
uint32_t blkCnt; /* loop counter */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
#if defined (ARM_MATH_DSP)
q31x4x2_t vecSrc;
q31x4_t sum;
/* Run the below code for Cortex-M4 and Cortex-M3 */
q31_t real1, real2, imag1, imag2; /* Temporary variables to hold input values */
q31_t out1, out2, out3, out4; /* Accumulators */
q63_t mul1, mul2, mul3, mul4; /* Temporary variables */
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
/*loop Unrolling */
blkCnt = numSamples >> 2U;
sum = vqaddq(vmulhq(vecSrc.val[0], vecSrc.val[0]),
vmulhq(vecSrc.val[1], vecSrc.val[1]));
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* read complex input from source buffer */
real1 = pSrc[0];
imag1 = pSrc[1];
real2 = pSrc[2];
imag2 = pSrc[3];
sum = vshrq(sum, 1);
/* calculate power of input values */
mul1 = (q63_t) real1 *real1;
mul2 = (q63_t) imag1 *imag1;
mul3 = (q63_t) real2 *real2;
mul4 = (q63_t) imag2 *imag2;
/*
/* get the result to 3.29 format */
out1 = (q31_t) (mul1 >> 33);
out2 = (q31_t) (mul2 >> 33);
out3 = (q31_t) (mul3 >> 33);
out4 = (q31_t) (mul4 >> 33);
This function is using a table. There are compilations flags to avoid
including this table (and in this case, arm_cmplx_maq_q31 must not
be built and linked.)
/* add real and imaginary accumulators */
out1 = out1 + out2;
out3 = out3 + out4;
*/
sum = FAST_VSQRT_Q31(sum);
/* read complex input from source buffer */
real1 = pSrc[4];
imag1 = pSrc[5];
real2 = pSrc[6];
imag2 = pSrc[7];
vst1q(pDst, sum);
/* calculate square root */
arm_sqrt_q31(out1, &pDst[0]);
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
pSrc += 8;
pDst += 4;
}
/* calculate power of input values */
mul1 = (q63_t) real1 *real1;
/* calculate square root */
arm_sqrt_q31(out3, &pDst[1]);
/* calculate power of input values */
mul2 = (q63_t) imag1 *imag1;
mul3 = (q63_t) real2 *real2;
mul4 = (q63_t) imag2 *imag2;
/* get the result to 3.29 format */
out1 = (q31_t) (mul1 >> 33);
out2 = (q31_t) (mul2 >> 33);
out3 = (q31_t) (mul3 >> 33);
out4 = (q31_t) (mul4 >> 33);
/* add real and imaginary accumulators */
out1 = out1 + out2;
out3 = out3 + out4;
/* calculate square root */
arm_sqrt_q31(out1, &pDst[2]);
/* increment destination by 8 to process next samples */
pSrc += 8U;
/* calculate square root */
arm_sqrt_q31(out3, &pDst[3]);
/* increment destination by 4 to process next samples */
pDst += 4U;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
/*
* tail
*/
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
/* Run the below code for Cortex-M0 */
blkCnt = numSamples;
#if defined (ARM_MATH_LOOPUNROLL)
#endif /* #if defined (ARM_MATH_DSP) */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 2.30 format in the destination buffer. */
/* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement the loop counter */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store result in 2.30 format in destination buffer. */
arm_sqrt_q31(acc0 + acc1, pDst++);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_mag group
@} end of cmplx_mag group
*/

View File

@@ -0,0 +1,174 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_squared_f16.c
* Description: Floating-point complex magnitude squared
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup cmplx_mag_squared Complex Magnitude Squared
Computes the magnitude squared of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup cmplx_mag_squared
@{
*/
/**
@brief Floating-point complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mag_squared_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
int32_t blockSize = numSamples; /* loop counters */
f16x8x2_t vecSrc;
f16x8_t sum;
/* Compute 4 complex samples at a time */
while (blockSize > 0)
{
mve_pred16_t p = vctp16q(blockSize);
vecSrc = vld2q(pSrc);
sum = vmulq_m(vuninitializedq_f16(),vecSrc.val[0], vecSrc.val[0],p);
sum = vfmaq_m(sum, vecSrc.val[1], vecSrc.val[1],p);
vstrhq_p_f16(pDst, sum,p);
pSrc += 16;
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blockSize-= 8;
}
}
#else
void arm_cmplx_mag_squared_f16(
const float16_t * pSrc,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
_Float16 real, imag; /* Temporary input variables */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag_squared group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_f32.c
* Description: Floating-point complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,179 +26,210 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup cmplx_mag_squared Complex Magnitude Squared
*
* Computes the magnitude squared of the elements of a complex data vector.
*
* The <code>pSrc</code> points to the source data and
* <code>pDst</code> points to the where the result should be written.
* <code>numSamples</code> specifies the number of complex samples
* in the input array and the data is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The input array has a total of <code>2*numSamples</code> values;
* the output array has a total of <code>numSamples</code> values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup cmplx_mag_squared Complex Magnitude Squared
Computes the magnitude squared of the elements of a complex data vector.
The <code>pSrc</code> points to the source data and
<code>pDst</code> points to the where the result should be written.
<code>numSamples</code> specifies the number of complex samples
in the input array and the data is stored in an interleaved fashion
(real, imag, real, imag, ...).
The input array has a total of <code>2*numSamples</code> values;
the output array has a total of <code>numSamples</code> values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Floating-point complex magnitude squared
* @param[in] *pSrc points to the complex input vector
* @param[out] *pDst points to the real output vector
* @param[in] numSamples number of complex samples in the input vector
* @return none.
@brief Floating-point complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mag_squared_f32(
float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
float32_t real, imag; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counter */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
f32x4x2_t vecSrc;
f32x4_t sum;
float32_t real, imag; /* Temporary input variables */
#if defined (ARM_MATH_DSP)
float32_t real1, real2, real3, real4; /* Temporary variables to hold real values */
float32_t imag1, imag2, imag3, imag4; /* Temporary variables to hold imaginary values */
float32_t mul1, mul2, mul3, mul4; /* Temporary variables */
float32_t mul5, mul6, mul7, mul8; /* Temporary variables */
float32_t out1, out2, out3, out4; /* Temporary variables to hold output values */
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
vst1q(pDst, sum);
/*loop Unrolling */
blkCnt = numSamples >> 2U;
pSrc += 8;
pDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Tail */
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_squared_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float32_t real, imag; /* Temporary input variables */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
float32x4_t vMagSqA;
float32x4x2_t vecB;
float32x4_t vRealB;
float32x4_t vImagB;
float32x4_t vMagSqB;
/* Loop unrolling: Compute 8 outputs at a time */
blkCnt = numSamples >> 3;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
/* read real input sample from source buffer */
real1 = pSrc[0];
/* read imaginary input sample from source buffer */
imag1 = pSrc[1];
/* out = sqrt((real * real) + (imag * imag)) */
/* calculate power of real value */
mul1 = real1 * real1;
vecA = vld2q_f32(pSrc);
pSrc += 8;
/* read real input sample from source buffer */
real2 = pSrc[2];
vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
vMagSqA = vaddq_f32(vRealA, vImagA);
/* calculate power of imaginary value */
mul2 = imag1 * imag1;
vecB = vld2q_f32(pSrc);
pSrc += 8;
/* read imaginary input sample from source buffer */
imag2 = pSrc[3];
vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
vMagSqB = vaddq_f32(vRealB, vImagB);
/* calculate power of real value */
mul3 = real2 * real2;
/* Store the result in the destination buffer. */
vst1q_f32(pDst, vMagSqA);
pDst += 4;
/* read real input sample from source buffer */
real3 = pSrc[4];
/* calculate power of imaginary value */
mul4 = imag2 * imag2;
/* read imaginary input sample from source buffer */
imag3 = pSrc[5];
/* calculate power of real value */
mul5 = real3 * real3;
/* calculate power of imaginary value */
mul6 = imag3 * imag3;
/* read real input sample from source buffer */
real4 = pSrc[6];
/* accumulate real and imaginary powers */
out1 = mul1 + mul2;
/* read imaginary input sample from source buffer */
imag4 = pSrc[7];
/* accumulate real and imaginary powers */
out2 = mul3 + mul4;
/* calculate power of real value */
mul7 = real4 * real4;
/* calculate power of imaginary value */
mul8 = imag4 * imag4;
/* store output to destination */
pDst[0] = out1;
/* accumulate real and imaginary powers */
out3 = mul5 + mul6;
/* store output to destination */
pDst[1] = out2;
/* accumulate real and imaginary powers */
out4 = mul7 + mul8;
/* store output to destination */
pDst[2] = out3;
/* increment destination pointer by 8 to process next samples */
pSrc += 8U;
/* store output to destination */
pDst[3] = out4;
/* increment destination pointer by 4 to process next samples */
pDst += 4U;
vst1q_f32(pDst, vMagSqB);
pDst += 4;
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples & 7;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
real = *pSrc++;
imag = *pSrc++;
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* out = (real * real) + (imag * imag) */
/* store the result in the destination buffer. */
/* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View File

@@ -0,0 +1,76 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mag_squared_f64.c
* Description: Floating-point complex magnitude squared
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup cmplx_mag_squared
@{
*/
/**
@brief Floating-point complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mag_squared_f64(
const float64_t * pSrc,
float64_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float64_t real, imag; /* Temporary input variables */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
/* store result in destination buffer. */
*pDst++ = (real * real) + (imag * imag);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of cmplx_mag_squared group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q15.c
* Description: Q15 complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,111 +26,196 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Q15 complex magnitude squared
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
@brief Q15 complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mag_squared_q15(
q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
q31_t acc0, acc1; /* Accumulators */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
q31_t in;
q31_t acc0; /* Accumulators */
q15x8x2_t vecSrc;
q15x8_t vReal, vImag;
q15x8_t vMagSq;
#if defined (ARM_MATH_DSP)
blkCnt = blockSize >> 3;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vReal = vmulhq(vecSrc.val[0], vecSrc.val[0]);
vImag = vmulhq(vecSrc.val[1], vecSrc.val[1]);
vMagSq = vqaddq(vReal, vImag);
vMagSq = vshrq(vMagSq, 1);
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
q31_t in1, in2, in3, in4;
q31_t acc2, acc3;
vst1q(pDst, vMagSq);
/*loop Unrolling */
blkCnt = numSamples >> 2U;
pSrc += 16;
pDst += 8;
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
blkCnt --;
}
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
/*
* tail
*/
blkCnt = blockSize & 7;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
in2 = *__SIMD32(pSrc)++;
in3 = *__SIMD32(pSrc)++;
in4 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
acc1 = __SMUAD(in2, in2);
acc2 = __SMUAD(in3, in3);
acc3 = __SMUAD(in4, in4);
in = read_q15x2_ia ((q15_t **) &pSrc);
acc0 = __SMUAD(in, in);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
*pDst++ = (q15_t) (acc1 >> 17);
*pDst++ = (q15_t) (acc2 >> 17);
*pDst++ = (q15_t) (acc3 >> 17);
/* Decrement the loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
in1 = *__SIMD32(pSrc)++;
acc0 = __SMUAD(in1, in1);
/* store the result in 3.13 format in the destination buffer. */
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
/* Decrement the loop counter */
/* Decrement loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
q15_t real, imag; /* Temporary variables to store real and imaginary values */
while (numSamples > 0U)
{
/* out = ((real * real) + (imag * imag)) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (real * real);
acc1 = (imag * imag);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
/* Decrement the loop counter */
numSamples--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#else
void arm_cmplx_mag_squared_q15(
const q15_t * pSrc,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
#if defined (ARM_MATH_DSP)
q31_t in;
q31_t acc0; /* Accumulators */
#else
q15_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#endif
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
*pDst++ = (q15_t) (acc0 >> 17);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
#if defined (ARM_MATH_DSP)
in = read_q15x2_ia (&pSrc);
acc0 = __SMUAD(in, in);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (acc0 >> 17);
#else
real = *pSrc++;
imag = *pSrc++;
acc0 = ((q31_t) real * real);
acc1 = ((q31_t) imag * imag);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
#endif
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mag_squared_q31.c
* Description: Q31 complex magnitude squared
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,58 +26,102 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup cmplx_mag_squared
* @{
@addtogroup cmplx_mag_squared
@{
*/
/**
* @brief Q31 complex magnitude squared
* @param *pSrc points to the complex input vector
* @param *pDst points to the real output vector
* @param numSamples number of complex samples in the input vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
* Input down scaling is not required.
@brief Q31 complex magnitude squared.
@param[in] pSrc points to input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
Input down scaling is not required.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mag_squared_q31(
q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
q31_t real, imag; /* Temporary variables to store real and imaginary values */
q31_t acc0, acc1; /* Accumulators */
int32_t blockSize = numSamples; /* loop counters */
uint32_t blkCnt; /* loop counters */
q31x4x2_t vecSrc;
q31x4_t vReal, vImag;
q31x4_t vMagSq;
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#if defined (ARM_MATH_DSP)
/* Compute 4 complex samples at a time */
blkCnt = blockSize >> 2;
while (blkCnt > 0U)
{
vecSrc = vld2q(pSrc);
vReal = vmulhq(vecSrc.val[0], vecSrc.val[0]);
vImag = vmulhq(vecSrc.val[1], vecSrc.val[1]);
vMagSq = vqaddq(vReal, vImag);
vMagSq = vshrq(vMagSq, 1);
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counter */
vst1q(pDst, vMagSq);
/* loop Unrolling */
pSrc += 8;
pDst += 4;
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
blkCnt --;
}
/* Tail */
blkCnt = blockSize & 3;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store result in 3.29 format in destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mag_squared_q31(
const q31_t * pSrc,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t real, imag; /* Temporary input variables */
q31_t acc0, acc1; /* Accumulators */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
@@ -90,60 +134,54 @@ void arm_cmplx_mag_squared_q31(
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
*pDst++ = acc0 + acc1;
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* out = ((real * real) + (imag * imag)) */
/* C[0] = (A[0] * A[0] + A[1] * A[1]) */
real = *pSrc++;
imag = *pSrc++;
acc0 = (q31_t) (((q63_t) real * real) >> 33);
acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
/* store the result in 3.29 format in the destination buffer. */
/* store result in 3.29 format in destination buffer. */
*pDst++ = acc0 + acc1;
/* Decrement the loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of cmplx_mag_squared group
@} end of cmplx_mag_squared group
*/

View File

@@ -0,0 +1,271 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mult_cmplx_f16.c
* Description: Floating-point complex-by-complex multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
Multiplies a complex vector by another complex vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup CmplxByCmplxMult
@{
*/
/**
@brief Floating-point complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t numSamples)
{
int32_t blkCnt;
f16x8_t vecSrcA, vecSrcB;
f16x8_t vecSrcC, vecSrcD;
f16x8_t vec_acc;
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 8;
pSrcB += 8;
while (blkCnt > 0) {
vec_acc = vcmulq(vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
vst1q(pDst, vec_acc);
pDst += 8;
vec_acc = vcmulq(vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
vst1q(pDst, vec_acc);
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmulq(vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vst1q(pDst, vec_acc);
pDst += 8;
vec_acc = vcmulq(vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vst1q(pDst, vec_acc);
pDst += 8;
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
vstrhq_p_f16(pDst, vec_acc, p);
pDst += 8;
blkCnt -= 8;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
do {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_f16(pSrcA, p);
vecSrcB = vldrhq_z_f16(pSrcB, p);
vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
vstrhq_p_f16(pDst, vec_acc, p);
pDst += 8;
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
while (blkCnt > 0);
}
}
#else
void arm_cmplx_mult_cmplx_f16(
const float16_t * pSrcA,
const float16_t * pSrcB,
float16_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
_Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of CmplxByCmplxMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_f32.c
* Description: Floating-point complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,171 +26,280 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
*
* Multiplies a complex vector by another complex vector and generates a complex result.
* The data in the complex arrays is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The parameter <code>numSamples</code> represents the number of complex
* samples processed. The complex arrays have a total of <code>2*numSamples</code>
* real values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
* pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup CmplxByCmplxMult Complex-by-Complex Multiplication
Multiplies a complex vector by another complex vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Floating-point complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
@brief Floating-point complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_f32(
float32_t * pSrcA,
float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples)
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples)
{
float32_t a1, b1, c1, d1; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counters */
int32_t blkCnt;
f32x4_t vecSrcA, vecSrcB;
f32x4_t vecSrcC, vecSrcD;
f32x4_t vec_acc;
#if defined (ARM_MATH_DSP)
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t a2, b2, c2, d2; /* Temporary variables to store real and imaginary values */
float32_t acc1, acc2, acc3, acc4;
while (blkCnt > 0) {
vec_acc = vcmulq(vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
vst1q(pDst, vec_acc);
pDst += 4;
/* loop Unrolling */
vec_acc = vcmulq(vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
vst1q(pDst, vec_acc);
pDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vec_acc = vcmulq(vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vst1q(pDst, vec_acc);
pDst += 4;
vec_acc = vcmulq(vecSrcC, vecSrcD);
vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
vst1q(pDst, vec_acc);
pDst += 4;
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
vstrwq_p_f32(pDst, vec_acc, p);
pDst += 4;
blkCnt -= 4;
}
} else {
/* small vector */
blkCnt = numSamples * CMPLX_DIM;
vec_acc = vdupq_n_f32(0.0f);
do {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_f32(pSrcA, p);
vecSrcB = vldrwq_z_f32(pSrcB, p);
vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
vstrwq_p_f32(pDst, vec_acc, p);
pDst += 4;
/*
* Decrement the blkCnt loop counter
* Advance vector source and destination pointers
*/
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
while (blkCnt > 0);
}
}
#else
void arm_cmplx_mult_cmplx_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t va, vb;
float32x4x2_t outCplx;
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
va = vld2q_f32(pSrcA); // load & separate real/imag pSrcA (de-interleave 2)
vb = vld2q_f32(pSrcB); // load & separate real/imag pSrcB
/* Increment pointers */
pSrcA += 8;
pSrcB += 8;
/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);
/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);
vst2q_f32(pDst, outCplx);
/* Increment pointer */
pDst += 8;
/* Decrement the loop counter */
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a1 = *pSrcA; /* A[2 * i] */
c1 = *pSrcB; /* B[2 * i] */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
b1 = *(pSrcA + 1); /* A[2 * i + 1] */
acc1 = a1 * c1; /* acc1 = A[2 * i] * B[2 * i] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
a2 = *(pSrcA + 2); /* A[2 * i + 2] */
acc2 = (b1 * c1); /* acc2 = A[2 * i + 1] * B[2 * i] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
d1 = *(pSrcB + 1); /* B[2 * i + 1] */
c2 = *(pSrcB + 2); /* B[2 * i + 2] */
acc1 -= b1 * d1; /* acc1 = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
d2 = *(pSrcB + 3); /* B[2 * i + 3] */
acc3 = a2 * c2; /* acc3 = A[2 * i + 2] * B[2 * i + 2] */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
b2 = *(pSrcA + 3); /* A[2 * i + 3] */
acc2 += (a1 * d1); /* acc2 = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
a1 = *(pSrcA + 4); /* A[2 * i + 4] */
acc4 = (a2 * d2); /* acc4 = A[2 * i + 2] * B[2 * i + 3] */
c1 = *(pSrcB + 4); /* B[2 * i + 4] */
acc3 -= (b2 * d2); /* acc3 = A[2 * i + 2] * B[2 * i + 2] - A[2 * i + 3] * B[2 * i + 3] */
*pDst = acc1; /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1] */
b1 = *(pSrcA + 5); /* A[2 * i + 5] */
acc4 += b2 * c2; /* acc4 = A[2 * i + 2] * B[2 * i + 3] + A[2 * i + 3] * B[2 * i + 2] */
*(pDst + 1) = acc2; /* C[2 * i + 1] = A[2 * i + 1] * B[2 * i] + A[2 * i] * B[2 * i + 1] */
acc1 = (a1 * c1);
d1 = *(pSrcB + 5);
acc2 = (b1 * c1);
*(pDst + 2) = acc3;
*(pDst + 3) = acc4;
a2 = *(pSrcA + 6);
acc1 -= (b1 * d1);
c2 = *(pSrcB + 6);
acc2 += (a1 * d1);
b2 = *(pSrcA + 7);
acc3 = (a2 * c2);
d2 = *(pSrcB + 7);
acc4 = (b2 * c2);
*(pDst + 4) = acc1;
pSrcA += 8U;
acc3 -= (b2 * d2);
acc4 += (a2 * d2);
*(pDst + 5) = acc2;
pSrcB += 8U;
*(pDst + 6) = acc3;
*(pDst + 7) = acc4;
pDst += 8U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a1 = *pSrcA++;
b1 = *pSrcA++;
c1 = *pSrcB++;
d1 = *pSrcB++;
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
/* store the result in the destination buffer. */
*pDst++ = (a1 * c1) - (b1 * d1);
*pDst++ = (a1 * d1) + (b1 * c1);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* Decrement the numSamples loop counter */
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View File

@@ -0,0 +1,83 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mult_cmplx_f64.c
* Description: Floating-point complex-by-complex multiplication
*
* $Date: 13 September 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions.h"
/**
@ingroup groupCmplxMath
*/
/**
@addtogroup CmplxByCmplxMult
@{
*/
/**
@brief Floating-point complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
*/
void arm_cmplx_mult_cmplx_f64(
const float64_t * pSrcA,
const float64_t * pSrcB,
float64_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float64_t a, b, c, d; /* Temporary variables to store real and imaginary values */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store result in destination buffer. */
*pDst++ = (a * c) - (b * d);
*pDst++ = (a * d) + (b * c);
/* Decrement loop counter */
blkCnt--;
}
}
/**
@} end of CmplxByCmplxMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q15.c
* Description: Q15 complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,156 +26,233 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Q15 complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
@brief Q15 complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_q15(
q15_t * pSrcA,
q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples)
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples)
{
q15_t a, b, c, d; /* Temporary variables to store real and imaginary values */
int32_t blkCnt;
q15x8_t vecSrcA, vecSrcB;
q15x8_t vecSrcC, vecSrcD;
q15x8_t vecDst;
#if defined (ARM_MATH_DSP)
blkCnt = (numSamples >> 3);
blkCnt -= 1;
if (blkCnt > 0)
{
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 8;
pSrcB += 8;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
while (blkCnt > 0)
{
/* loop Unrolling */
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 8;
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 8;
vstrhq_s16(pDst, vshrq(vecDst, 2));
pDst += 8;
vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 8;
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 8;
vstrhq_s16(pDst, vshrq(vecDst, 2));
pDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vstrhq_s16(pDst, vshrq(vecDst, 2));
pDst += 8;
vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
vstrhq_s16(pDst, vshrq(vecDst, 2));
pDst += 8;
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 7);
do
{
mve_pred16_t p = vctp16q(blkCnt);
pSrcA += 8;
pSrcB += 8;
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
vstrhq_p_s16(pDst, vecDst, p);
pDst += 8;
blkCnt -= 8;
}
while ((int32_t) blkCnt > 0);
}
else
{
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp16q(blkCnt);
vecSrcA = vldrhq_z_s16(pSrcA, p);
vecSrcB = vldrhq_z_s16(pSrcB, p);
vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
vstrhq_p_s16(pDst, vecDst, p);
pDst += 8;
pSrcA += 8;
pSrcB += 8;
blkCnt -= 8;
}
}
}
#else
void arm_cmplx_mult_cmplx_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
q15_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q15_t a, b, c, d; /* Temporary variables */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* Decrement the blockSize loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17);
/* store the result in 3.13 format in the destination buffer. */
*pDst++ =
(q15_t) (q31_t) (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17);
/* store result in 3.13 format in destination buffer. */
*pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
*pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
/* Decrement the blockSize loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_cmplx_q31.c
* Description: Q31 complex-by-complex multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,289 +26,228 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByCmplxMult
* @{
@addtogroup CmplxByCmplxMult
@{
*/
/**
* @brief Q31 complex-by-complex multiplication
* @param[in] *pSrcA points to the first input vector
* @param[in] *pSrcB points to the second input vector
* @param[out] *pDst points to the output vector
* @param[in] numSamples number of complex samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
* Input down scaling is not required.
@brief Q31 complex-by-complex multiplication.
@param[in] pSrcA points to first input vector
@param[in] pSrcB points to second input vector
@param[out] pDst points to output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
Input down scaling is not required.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_q31(
q31_t * pSrcA,
q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples)
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples)
{
q31_t a, b, c, d; /* Temporary variables to store real and imaginary values */
uint32_t blkCnt; /* loop counters */
q31_t mul1, mul2, mul3, mul4;
q31_t out1, out2;
int32_t blkCnt;
q31x4_t vecSrcA, vecSrcB;
q31x4_t vecSrcC, vecSrcD;
q31x4_t vecDst;
#if defined (ARM_MATH_DSP)
blkCnt = numSamples >> 2;
blkCnt -= 1;
if (blkCnt > 0) {
/* should give more freedom to generate stall free code */
vecSrcA = vld1q(pSrcA);
vecSrcB = vld1q(pSrcB);
pSrcA += 4;
pSrcB += 4;
/* Run the below code for Cortex-M4 and Cortex-M3 */
while (blkCnt > 0) {
/* loop Unrolling */
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
pSrcA += 4;
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
pSrcB += 4;
vst1q(pDst, vshrq(vecDst, 2));
pDst += 4;
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
vecSrcA = vld1q(pSrcA);
pSrcA += 4;
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
vecSrcB = vld1q(pSrcB);
pSrcB += 4;
vst1q(pDst, vshrq(vecDst, 2));
pDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* process last elements out of the loop avoid the armclang breaking the SW pipeline */
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
vecSrcC = vld1q(pSrcA);
vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
vecSrcD = vld1q(pSrcB);
vst1q(pDst, vshrq(vecDst, 2));
pDst += 4;
vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
vst1q(pDst, vshrq(vecDst, 2));
pDst += 4;
/*
* tail
*/
blkCnt = CMPLX_DIM * (numSamples & 3);
do {
mve_pred16_t p = vctp32q(blkCnt);
pSrcA += 4;
pSrcB += 4;
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
vstrwq_p_s32(pDst, vecDst, p);
pDst += 4;
blkCnt -= 4;
}
while ((int32_t) blkCnt > 0);
} else {
blkCnt = numSamples * CMPLX_DIM;
while (blkCnt > 0) {
mve_pred16_t p = vctp32q(blkCnt);
vecSrcA = vldrwq_z_s32(pSrcA, p);
vecSrcB = vldrwq_z_s32(pSrcB, p);
vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
vstrwq_p_s32(pDst, vecDst, p);
pDst += 4;
pSrcA += 4;
pSrcB += 4;
blkCnt -= 4;
}
}
}
#else
void arm_cmplx_mult_cmplx_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
q31_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t a, b, c, d; /* Temporary variables */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* store result in 3.29 format in destination buffer. */
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
/* loop Unrolling */
blkCnt = numSamples >> 1U;
/* First part of the processing with loop unrolling. Compute 2 outputs at a time.
** a second loop below computes the remaining 1 sample. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
blkCnt--;
}
/* If the blockSize is not a multiple of 2, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x2U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
/* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
/* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
a = *pSrcA++;
b = *pSrcA++;
c = *pSrcB++;
d = *pSrcB++;
mul1 = (q31_t) (((q63_t) a * c) >> 32);
mul2 = (q31_t) (((q63_t) b * d) >> 32);
mul3 = (q31_t) (((q63_t) a * d) >> 32);
mul4 = (q31_t) (((q63_t) b * c) >> 32);
/* store result in 3.29 format in destination buffer. */
*pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
*pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
mul1 = (mul1 >> 1);
mul2 = (mul2 >> 1);
mul3 = (mul3 >> 1);
mul4 = (mul4 >> 1);
out1 = mul1 - mul2;
out2 = mul3 + mul4;
/* store the real result in 3.29 format in the destination buffer. */
*pDst++ = out1;
/* store the imag result in 3.29 format in the destination buffer. */
*pDst++ = out2;
/* Decrement the blockSize loop counter */
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of CmplxByCmplxMult group
@} end of CmplxByCmplxMult group
*/

View File

@@ -0,0 +1,194 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_cmplx_mult_real_f16.c
* Description: Floating-point complex by real multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/complex_math_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupCmplxMath
*/
/**
@defgroup CmplxByRealMult Complex-by-Real Multiplication
Multiplies a complex vector by a real vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values while the real array has a total of <code>numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
@addtogroup CmplxByRealMult
@{
*/
/**
@brief Floating-point complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_real_f16(
const float16_t * pSrcCmplx,
const float16_t * pSrcReal,
float16_t * pCmplxDst,
uint32_t numSamples)
{
static const uint16_t stride_cmplx_x_real_16[8] = {
0, 0, 1, 1, 2, 2, 3, 3
};
uint32_t blockSizeC = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f16x8_t rVec;
f16x8_t cmplxVec;
f16x8_t dstVec;
uint16x8_t strideVec;
/* stride vector for pairs of real generation */
strideVec = vld1q(stride_cmplx_x_real_16);
/* Compute 4 complex outputs at a time */
blkCnt = blockSizeC >> 3;
while (blkCnt > 0U)
{
cmplxVec = vld1q(pSrcCmplx);
rVec = vldrhq_gather_shifted_offset_f16(pSrcReal, strideVec);
dstVec = vmulq(cmplxVec, rVec);
vst1q(pCmplxDst, dstVec);
pSrcReal += 4;
pSrcCmplx += 8;
pCmplxDst += 8;
blkCnt--;
}
blkCnt = blockSizeC & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
cmplxVec = vld1q(pSrcCmplx);
rVec = vldrhq_gather_shifted_offset_f16(pSrcReal, strideVec);
dstVec = vmulq(cmplxVec, rVec);
vstrhq_p_f16(pCmplxDst, dstVec, p0);
}
}
#else
void arm_cmplx_mult_real_f16(
const float16_t * pSrcCmplx,
const float16_t * pSrcReal,
float16_t * pCmplxDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float16_t in; /* Temporary variable */
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
in = *pSrcReal++;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
in = *pSrcReal++;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
in = *pSrcReal++;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
*pCmplxDst++ = (_Float16)*pSrcCmplx++ * (_Float16)in;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of CmplxByRealMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_f32.c
* Description: Floating-point complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,188 +26,199 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @defgroup CmplxByRealMult Complex-by-Real Multiplication
*
* Multiplies a complex vector by a real vector and generates a complex result.
* The data in the complex arrays is stored in an interleaved fashion
* (real, imag, real, imag, ...).
* The parameter <code>numSamples</code> represents the number of complex
* samples processed. The complex arrays have a total of <code>2*numSamples</code>
* real values while the real array has a total of <code>numSamples</code>
* real values.
*
* The underlying algorithm is used:
*
* <pre>
* for(n=0; n<numSamples; n++) {
* pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
* pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
* }
* </pre>
*
* There are separate functions for floating-point, Q15, and Q31 data types.
@defgroup CmplxByRealMult Complex-by-Real Multiplication
Multiplies a complex vector by a real vector and generates a complex result.
The data in the complex arrays is stored in an interleaved fashion
(real, imag, real, imag, ...).
The parameter <code>numSamples</code> represents the number of complex
samples processed. The complex arrays have a total of <code>2*numSamples</code>
real values while the real array has a total of <code>numSamples</code>
real values.
The underlying algorithm is used:
<pre>
for (n = 0; n < numSamples; n++) {
pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];
pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];
}
</pre>
There are separate functions for floating-point, Q15, and Q31 data types.
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Floating-point complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
@brief Floating-point complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_real_f32(
float32_t * pSrcCmplx,
float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples)
const float32_t * pSrcCmplx,
const float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples)
{
float32_t in; /* Temporary variable to store input value */
uint32_t blkCnt; /* loop counters */
static const uint32_t stride_cmplx_x_real_32[4] = { 0, 0, 1, 1 };
#if defined (ARM_MATH_DSP)
uint32_t blockSizeC = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
f32x4_t rVec;
f32x4_t cmplxVec;
f32x4_t dstVec;
uint32x4_t strideVec;
float32_t in;
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t inA1, inA2, inA3, inA4; /* Temporary variables to hold input data */
float32_t inA5, inA6, inA7, inA8; /* Temporary variables to hold input data */
float32_t inB1, inB2, inB3, inB4; /* Temporary variables to hold input data */
float32_t out1, out2, out3, out4; /* Temporary variables to hold output data */
float32_t out5, out6, out7, out8; /* Temporary variables to hold output data */
/* loop Unrolling */
/* stride vector for pairs of real generation */
strideVec = vld1q(stride_cmplx_x_real_32);
/* Compute 4 complex outputs at a time */
blkCnt = blockSizeC >> 2;
while (blkCnt > 0U)
{
cmplxVec = vld1q(pSrcCmplx);
rVec = vldrwq_gather_shifted_offset_f32(pSrcReal, strideVec);
dstVec = vmulq(cmplxVec, rVec);
vst1q(pCmplxDst, dstVec);
pSrcReal += 2;
pSrcCmplx += 4;
pCmplxDst += 4;
blkCnt--;
}
blkCnt = (blockSizeC & 3) >> 1;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mult_real_f32(
const float32_t * pSrcCmplx,
const float32_t * pSrcReal,
float32_t * pCmplxDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
float32_t in; /* Temporary variable */
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4_t r;
float32x4x2_t ab,outCplx;
/* Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
ab = vld2q_f32(pSrcCmplx); // load & separate real/imag pSrcA (de-interleave 2)
r = vld1q_f32(pSrcReal); // load & separate real/imag pSrcB
/* Increment pointers */
pSrcCmplx += 8;
pSrcReal += 4;
outCplx.val[0] = vmulq_f32(ab.val[0], r);
outCplx.val[1] = vmulq_f32(ab.val[1], r);
vst2q_f32(pCmplxDst, outCplx);
pCmplxDst += 8;
blkCnt--;
}
/* Tail */
blkCnt = numSamples & 3;
#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read input from complex input buffer */
inA1 = pSrcCmplx[0];
inA2 = pSrcCmplx[1];
/* read input from real input buffer */
inB1 = pSrcReal[0];
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read input from complex input buffer */
inA3 = pSrcCmplx[2];
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* multiply complex buffer real input with real buffer input */
out1 = inA1 * inB1;
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* read input from complex input buffer */
inA4 = pSrcCmplx[3];
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* multiply complex buffer imaginary input with real buffer input */
out2 = inA2 * inB1;
in = *pSrcReal++;
*pCmplxDst++ = *pSrcCmplx++* in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* read input from real input buffer */
inB2 = pSrcReal[1];
/* read input from complex input buffer */
inA5 = pSrcCmplx[4];
/* multiply complex buffer real input with real buffer input */
out3 = inA3 * inB2;
/* read input from complex input buffer */
inA6 = pSrcCmplx[5];
/* read input from real input buffer */
inB3 = pSrcReal[2];
/* multiply complex buffer imaginary input with real buffer input */
out4 = inA4 * inB2;
/* read input from complex input buffer */
inA7 = pSrcCmplx[6];
/* multiply complex buffer real input with real buffer input */
out5 = inA5 * inB3;
/* read input from complex input buffer */
inA8 = pSrcCmplx[7];
/* multiply complex buffer imaginary input with real buffer input */
out6 = inA6 * inB3;
/* read input from real input buffer */
inB4 = pSrcReal[3];
/* store result to destination bufer */
pCmplxDst[0] = out1;
/* multiply complex buffer real input with real buffer input */
out7 = inA7 * inB4;
/* store result to destination bufer */
pCmplxDst[1] = out2;
/* multiply complex buffer imaginary input with real buffer input */
out8 = inA8 * inB4;
/* store result to destination bufer */
pCmplxDst[2] = out3;
pCmplxDst[3] = out4;
pCmplxDst[4] = out5;
/* incremnet complex input buffer by 8 to process next samples */
pSrcCmplx += 8U;
/* store result to destination bufer */
pCmplxDst[5] = out6;
/* increment real input buffer by 4 to process next samples */
pSrcReal += 4U;
/* store result to destination bufer */
pCmplxDst[6] = out7;
pCmplxDst[7] = out8;
/* increment destination buffer by 8 to process next sampels */
pCmplxDst += 8U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ = (*pSrcCmplx++) * (in);
*pCmplxDst++ = (*pSrcCmplx++) * (in);
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* Decrement the numSamples loop counter */
in = *pSrcReal++;
/* store result in destination buffer. */
*pCmplxDst++ = *pSrcCmplx++ * in;
*pCmplxDst++ = *pSrcCmplx++ * in;
/* Decrement loop counter */
blkCnt--;
}
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q15.c
* Description: Q15 complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,80 +26,129 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Q15 complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_real_q15(
q15_t * pSrcCmplx,
q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples)
const q15_t * pSrcCmplx,
const q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples)
{
q15_t in; /* Temporary variable to store input value */
static const uint16_t stride_cmplx_x_real_16[8] = {
0, 0, 1, 1, 2, 2, 3, 3
};
q15x8_t rVec;
q15x8_t cmplxVec;
q15x8_t dstVec;
uint16x8_t strideVec;
uint32_t blockSizeC = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q15_t in;
#if defined (ARM_MATH_DSP)
/*
* stride vector for pairs of real generation
*/
strideVec = vld1q(stride_cmplx_x_real_16);
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
q31_t inA1, inA2; /* Temporary variables to hold input data */
q31_t inB1; /* Temporary variables to hold input data */
q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
blkCnt = blockSizeC >> 3;
/* loop Unrolling */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
cmplxVec = vld1q(pSrcCmplx);
rVec = vldrhq_gather_shifted_offset_s16(pSrcReal, strideVec);
dstVec = vqdmulhq(cmplxVec, rVec);
vst1q(pCmplxDst, dstVec);
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
pSrcReal += 4;
pSrcCmplx += 8;
pCmplxDst += 8;
blkCnt --;
}
/* Tail */
blkCnt = (blockSizeC & 7) >> 1;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read complex number both real and imaginary from complex input buffer */
inA1 = *__SIMD32(pSrcCmplx)++;
/* read two real values at a time from real input buffer */
inB1 = *__SIMD32(pSrcReal)++;
/* read complex number both real and imaginary from complex input buffer */
inA2 = *__SIMD32(pSrcCmplx)++;
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mult_real_q15(
const q15_t * pSrcCmplx,
const q15_t * pSrcReal,
q15_t * pCmplxDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary variable */
#if defined (ARM_MATH_LOOPUNROLL)
#if defined (ARM_MATH_DSP)
q31_t inA1, inA2; /* Temporary variables to hold input data */
q31_t inB1; /* Temporary variables to hold input data */
q15_t out1, out2, out3, out4; /* Temporary variables to hold output data */
q31_t mul1, mul2, mul3, mul4; /* Temporary variables to hold intermediate data */
#endif
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
#if defined (ARM_MATH_DSP)
/* read 2 complex numbers both real and imaginary from complex input buffer */
inA1 = read_q15x2_ia (&pSrcCmplx);
inA2 = read_q15x2_ia (&pSrcCmplx);
/* read 2 real values at a time from real input buffer */
inB1 = read_q15x2_ia (&pSrcReal);
/* multiply complex number with real numbers */
#ifndef ARM_MATH_BIG_ENDIAN
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
#else
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* saturate the result */
@@ -109,27 +158,23 @@ void arm_cmplx_mult_real_q15(
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
/* pack real and imaginary outputs and store them to destination */
*__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
inA1 = *__SIMD32(pSrcCmplx)++;
inB1 = *__SIMD32(pSrcReal)++;
inA2 = *__SIMD32(pSrcCmplx)++;
inA1 = read_q15x2_ia (&pSrcCmplx);
inA2 = read_q15x2_ia (&pSrcCmplx);
inB1 = read_q15x2_ia (&pSrcReal);
#ifndef ARM_MATH_BIG_ENDIAN
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
#else
mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
out1 = (q15_t) __SSAT(mul1 >> 15U, 16);
@@ -137,55 +182,57 @@ void arm_cmplx_mult_real_q15(
out3 = (q15_t) __SSAT(mul3 >> 15U, 16);
out4 = (q15_t) __SSAT(mul4 >> 15U, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
*__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
/* Decrement the numSamples loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
write_q15x2_ia (&pCmplxDst, __PKHBT(out1, out2, 16));
write_q15x2_ia (&pCmplxDst, __PKHBT(out3, out4, 16));
#else
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
/* Decrement the numSamples loop counter */
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
in = *pSrcReal++;
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* realOut = realA * realB. */
/* imagOut = imagA * realB. */
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ =
(q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
*pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
/* Decrement the numSamples loop counter */
numSamples--;
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mult_real_q31.c
* Description: Q31 complex by real multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,186 +26,179 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/complex_math_functions.h"
/**
* @ingroup groupCmplxMath
@ingroup groupCmplxMath
*/
/**
* @addtogroup CmplxByRealMult
* @{
@addtogroup CmplxByRealMult
@{
*/
/**
* @brief Q31 complex-by-real multiplication
* @param[in] *pSrcCmplx points to the complex input vector
* @param[in] *pSrcReal points to the real input vector
* @param[out] *pCmplxDst points to the complex output vector
* @param[in] numSamples number of samples in each vector
* @return none.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
@brief Q31 complex-by-real multiplication.
@param[in] pSrcCmplx points to complex input vector
@param[in] pSrcReal points to real input vector
@param[out] pCmplxDst points to complex output vector
@param[in] numSamples number of samples in each vector
@return none
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_real_q31(
q31_t * pSrcCmplx,
q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples)
const q31_t * pSrcCmplx,
const q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples)
{
q31_t inA1; /* Temporary variable to store input value */
#if defined (ARM_MATH_DSP)
static const uint32_t stride_cmplx_x_real_32[4] = {
0, 0, 1, 1
};
q31x4_t rVec;
q31x4_t cmplxVec;
q31x4_t dstVec;
uint32x4_t strideVec;
uint32_t blockSizeC = numSamples * CMPLX_DIM; /* loop counters */
uint32_t blkCnt;
q31_t in;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint32_t blkCnt; /* loop counters */
q31_t inA2, inA3, inA4; /* Temporary variables to hold input data */
q31_t inB1, inB2; /* Temporary variabels to hold input data */
q31_t out1, out2, out3, out4; /* Temporary variables to hold output data */
/*
* stride vector for pairs of real generation
*/
strideVec = vld1q(stride_cmplx_x_real_32);
/* loop Unrolling */
/* Compute 4 complex outputs at a time */
blkCnt = blockSizeC >> 2;
while (blkCnt > 0U)
{
cmplxVec = vld1q(pSrcCmplx);
rVec = vldrwq_gather_shifted_offset_s32(pSrcReal, strideVec);
dstVec = vqdmulhq(cmplxVec, rVec);
vst1q(pCmplxDst, dstVec);
pSrcReal += 2;
pSrcCmplx += 4;
pCmplxDst += 4;
blkCnt --;
}
blkCnt = (blockSizeC & 3) >> 1;
while (blkCnt > 0U)
{
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
in = *pSrcReal++;
/* store saturated result in 1.31 format to destination buffer */
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
/* Decrement loop counter */
blkCnt--;
}
}
#else
void arm_cmplx_mult_real_q31(
const q31_t * pSrcCmplx,
const q31_t * pSrcReal,
q31_t * pCmplxDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
q31_t in; /* Temporary variable */
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
inB2 = *pSrcReal++;
/* read imaginary input from complex input buffer */
inA3 = *pSrcCmplx++;
inA4 = *pSrcCmplx++;
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
out3 = ((q63_t) inA3 * inB2) >> 32;
out4 = ((q63_t) inA4 * inB2) >> 32;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
/* store saturated result in 1.31 format to destination buffer */
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
/* store result in destination buffer. */
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
out3 = out3 << 1;
out4 = out4 << 1;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
*pCmplxDst++ = out3;
*pCmplxDst++ = out4;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
inB2 = *pSrcReal++;
/* read imaginary input from complex input buffer */
inA3 = *pSrcCmplx++;
inA4 = *pSrcCmplx++;
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
out3 = ((q63_t) inA3 * inB2) >> 32;
out4 = ((q63_t) inA4 * inB2) >> 32;
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
out3 = __SSAT(out3, 31);
out4 = __SSAT(out4, 31);
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
out3 = out3 << 1;
out4 = out4 << 1;
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
*pCmplxDst++ = out3;
*pCmplxDst++ = out4;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
{
/* C[2 * i] = A[2 * i] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* read real input from complex input buffer */
inA1 = *pSrcCmplx++;
inA2 = *pSrcCmplx++;
/* read input from real input bufer */
inB1 = *pSrcReal++;
/* multiply complex input with real input */
out1 = ((q63_t) inA1 * inB1) >> 32;
out2 = ((q63_t) inA2 * inB1) >> 32;
/* sature the result */
out1 = __SSAT(out1, 31);
out2 = __SSAT(out2, 31);
/* get result in 1.31 format */
out1 = out1 << 1;
out2 = out2 << 1;
/* store the result to destination buffer */
*pCmplxDst++ = out1;
*pCmplxDst++ = out2;
/* Decrement the numSamples loop counter */
blkCnt--;
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
while (numSamples > 0U)
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* realOut = realA * realB. */
/* imagReal = imagA * realB. */
inA1 = *pSrcReal++;
/* store the result in the destination buffer. */
*pCmplxDst++ =
(q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
*pCmplxDst++ =
(q31_t) clip_q63_to_q31(((q63_t) * pSrcCmplx++ * inA1) >> 31);
/* C[2 * i ] = A[2 * i ] * B[i]. */
/* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
/* Decrement the numSamples loop counter */
numSamples--;
in = *pSrcReal++;
#if defined (ARM_MATH_DSP)
/* store saturated result in 1.31 format to destination buffer */
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
*pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
#else
/* store result in destination buffer. */
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
*pCmplxDst++ = (q31_t) clip_q63_to_q31(((q63_t) *pSrcCmplx++ * in) >> 31);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of CmplxByRealMult group
@} end of CmplxByRealMult group
*/