инициализация spi перенесена на cube (работает)

убрана кастомная библиотека periph_general
This commit is contained in:
2025-06-27 15:28:58 +03:00
parent 981dbf9bfa
commit a9a2466359
1587 changed files with 403396 additions and 250770 deletions

View File

@@ -0,0 +1,54 @@
# Build script for the CMSIS-DSP matrix-function static library.
cmake_minimum_required(VERSION 3.14)
project(CMSISDSPMatrix)

include(configLib)
include(configDsp)

# NOTE(review): file(GLOB) will not pick up newly added kernel files until
# the project is re-configured; an explicit source list would be more
# robust, but the globs are kept to preserve the existing build behaviour.
file(GLOB SRCF64 "./*_f64.c")
file(GLOB SRCF32 "./*_f32.c")
file(GLOB SRCF16 "./*_f16.c")
file(GLOB SRCQ31 "./*_q31.c")
file(GLOB SRCQ15 "./*_q15.c")
file(GLOB SRCQ7  "./*_q7.c")
file(GLOB SRCU32 "./*_u32.c")
file(GLOB SRCU16 "./*_u16.c")
file(GLOB SRCU8  "./*_u8.c")

add_library(CMSISDSPMatrix STATIC ${SRCF64})
target_sources(CMSISDSPMatrix PRIVATE ${SRCF32})

# Half-precision kernels are excluded for Arm Compiler 5 (ARMAC5) or when
# float16 support is explicitly disabled.
if((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
  # ${SRCF16} already globs every *_f16.c kernel in this directory, so the
  # former per-file f16 list (arm_mat_add_f16.c, arm_mat_sub_f16.c, ...)
  # registered the same sources a second time on the target; the duplicate
  # list has been removed.
  target_sources(CMSISDSPMatrix PRIVATE ${SRCF16})
endif()

target_sources(CMSISDSPMatrix PRIVATE ${SRCQ31})
target_sources(CMSISDSPMatrix PRIVATE ${SRCQ15})
target_sources(CMSISDSPMatrix PRIVATE ${SRCQ7})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU32})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU16})
target_sources(CMSISDSPMatrix PRIVATE ${SRCU8})

configLib(CMSISDSPMatrix ${ROOT})
configDsp(CMSISDSPMatrix ${ROOT})

### Includes
target_include_directories(CMSISDSPMatrix PUBLIC "${DSP}/Include")

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function source files.
*
* $Date: 18. March 2019
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f32.c"
#include "arm_mat_add_q15.c"
#include "arm_mat_add_q31.c"
#include "arm_mat_cmplx_mult_f32.c"
#include "arm_mat_cmplx_mult_q15.c"
#include "arm_mat_cmplx_mult_q31.c"
#include "arm_mat_init_f32.c"
#include "arm_mat_init_q15.c"
#include "arm_mat_init_q31.c"
#include "arm_mat_inverse_f32.c"
#include "arm_mat_inverse_f64.c"
#include "arm_mat_mult_f64.c"
#include "arm_mat_mult_f32.c"
#include "arm_mat_mult_fast_q15.c"
#include "arm_mat_mult_fast_q31.c"
#include "arm_mat_mult_q7.c"
#include "arm_mat_mult_q15.c"
#include "arm_mat_mult_q31.c"
#include "arm_mat_mult_opt_q31.c"
#include "arm_mat_scale_f32.c"
#include "arm_mat_scale_q15.c"
#include "arm_mat_scale_q31.c"
#include "arm_mat_sub_f64.c"
#include "arm_mat_sub_f32.c"
#include "arm_mat_sub_q15.c"
#include "arm_mat_sub_q31.c"
#include "arm_mat_trans_f32.c"
#include "arm_mat_trans_f64.c"
#include "arm_mat_trans_q7.c"
#include "arm_mat_trans_q15.c"
#include "arm_mat_trans_q31.c"
#include "arm_mat_vec_mult_f32.c"
#include "arm_mat_vec_mult_q31.c"
#include "arm_mat_vec_mult_q15.c"
#include "arm_mat_vec_mult_q7.c"
#include "arm_mat_cmplx_trans_f32.c"
#include "arm_mat_cmplx_trans_q31.c"
#include "arm_mat_cmplx_trans_q15.c"
#include "arm_mat_cholesky_f64.c"
#include "arm_mat_cholesky_f32.c"
#include "arm_mat_solve_upper_triangular_f32.c"
#include "arm_mat_solve_lower_triangular_f32.c"
#include "arm_mat_solve_upper_triangular_f64.c"
#include "arm_mat_solve_lower_triangular_f64.c"
#include "arm_mat_ldlt_f32.c"
#include "arm_mat_ldlt_f64.c"

View File

@@ -0,0 +1,41 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: MatrixFunctions.c
* Description: Combination of all matrix function f16 source files.
*
* $Date: 18. March 2020
* $Revision: V1.0.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "arm_mat_add_f16.c"
#include "arm_mat_sub_f16.c"
#include "arm_mat_trans_f16.c"
#include "arm_mat_scale_f16.c"
#include "arm_mat_mult_f16.c"
#include "arm_mat_vec_mult_f16.c"
#include "arm_mat_cmplx_trans_f16.c"
#include "arm_mat_cmplx_mult_f16.c"
#include "arm_mat_inverse_f16.c"
#include "arm_mat_init_f16.c"
#include "arm_mat_cholesky_f16.c"
#include "arm_mat_solve_upper_triangular_f16.c"
#include "arm_mat_solve_lower_triangular_f16.c"

View File

@@ -0,0 +1,217 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_add_f16.c
* Description: Floating-point matrix addition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixAdd
@{
*/
/**
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Helium/MVE implementation of half-precision matrix addition:
 * processes 8 f16 lanes per iteration, with a predicated tail for the
 * last 1..7 elements. */
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float16_t *pDataA, *pDataB, *pDataDst;
f16x8_t vecA, vecB, vecDst;
float16_t const *pSrcAVec;
float16_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float16_t const *) pDataA;
pSrcBVec = (float16_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: A, B and the destination must all
 * share the same dimensions. */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Main loop: 8 half-precision elements per iteration. */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
/* Predicated tail: only the remaining blkCnt lanes are stored by
 * vstrhq_p, so the (possibly uninitialized) inactive lanes of vecDst
 * are never written back. The loads may read past the logical end of
 * the buffers — presumably relying on MVE tail-predication semantics;
 * TODO(review) confirm against the surrounding buffer allocation. */
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
/* Scalar implementation of half-precision matrix addition.
 * Computes pDst = pSrcA + pSrcB element-wise; all three matrices must
 * have identical dimensions. Returns ARM_MATH_SUCCESS, or
 * ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is enabled and the
 * dimensions disagree. */
arm_status arm_mat_add_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
  arm_status status;                  /* operation result                */
  float16_t *srcA = pSrcA->pData;     /* walking pointer over matrix A   */
  float16_t *srcB = pSrcB->pData;     /* walking pointer over matrix B   */
  float16_t *dst  = pDst->pData;      /* walking pointer over the output */
  uint32_t total;                     /* element count of one matrix     */
  uint32_t count;                     /* elements left to process        */

#ifdef ARM_MATH_MATRIX_CHECK
  /* All three matrices must share the same dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols))
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* ARM_MATH_MATRIX_CHECK */
  {
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Unrolled path: four elements per iteration. */
    for (count = total >> 2U; count > 0U; count--)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *dst++ = (_Float16)*srcA++ + (_Float16)*srcB++;
      *dst++ = (_Float16)*srcA++ + (_Float16)*srcB++;
      *dst++ = (_Float16)*srcA++ + (_Float16)*srcB++;
      *dst++ = (_Float16)*srcA++ + (_Float16)*srcB++;
    }
    /* 0..3 leftover elements fall through to the cleanup loop. */
    count = total & 3U;
#else
    /* No unrolling: handle every element in the cleanup loop. */
    count = total;
#endif /* ARM_MATH_LOOPUNROLL */

    while (count > 0U)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *dst++ = (_Float16)*srcA++ + (_Float16)*srcB++;
      count--;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixAdd group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_add_f32.c
* Description: Floating-point matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,38 +26,118 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixAdd Matrix Addition
*
* Adds two matrices.
* \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
*
* The functions check to make sure that
* <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
* number of rows and columns.
@defgroup MatrixAdd Matrix Addition
Adds two matrices.
\image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Floating-point matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Helium/MVE implementation of single-precision matrix addition:
 * processes 4 f32 lanes per iteration, with a predicated tail for the
 * last 1..3 elements. */
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status;
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: A, B and the destination must all
 * share the same dimensions. */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
/* Main loop: 4 single-precision elements per iteration. */
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vaddq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
/* Predicated tail: only the remaining blkCnt lanes are stored by
 * vstrwq_p, so the (possibly uninitialized) inactive lanes of vecDst
 * are never written back. */
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vaddq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
return (status);
}
#else
#if defined(ARM_MATH_NEON)
/*
Neon version is assuming the matrix is small enough.
So no blocking is used for taking into account cache effects.
For big matrix, there exist better libraries for Neon.
*/
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -67,11 +147,6 @@ arm_status arm_mat_add_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
#if defined (ARM_MATH_DSP)
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
@@ -89,69 +164,27 @@ arm_status arm_mat_add_f32(
else
#endif
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
/* Loop unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vaddq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Read values from source B */
inB1 = pIn2[0];
/* Read values from source A */
inA2 = pIn1[1];
/* out = sourceA + sourceB */
out1 = inA1 + inB1;
/* Read values from source B */
inB2 = pIn2[1];
/* Read values from source A */
inA1 = pIn1[2];
/* out = sourceA + sourceB */
out2 = inA2 + inB2;
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* out = sourceA + sourceB */
out1 = inA1 + inB1;
/* out = sourceA + sourceB */
out2 = inA2 + inB2;
/* Store result in destination */
pOut[2] = out1;
/* Store result in destination */
pOut[3] = out2;
/* update pointers to process next sampels */
/* update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@@ -163,15 +196,6 @@ arm_status arm_mat_add_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
@@ -184,13 +208,97 @@ arm_status arm_mat_add_f32(
/* set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/* Scalar implementation of single-precision matrix addition.
 * Computes pDst = pSrcA + pSrcB element-wise; all three matrices must
 * have identical dimensions. Returns ARM_MATH_SUCCESS, or
 * ARM_MATH_SIZE_MISMATCH when ARM_MATH_MATRIX_CHECK is enabled and the
 * dimensions disagree. */
arm_status arm_mat_add_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
  arm_status status;                  /* operation result                */
  float32_t *srcA = pSrcA->pData;     /* walking pointer over matrix A   */
  float32_t *srcB = pSrcB->pData;     /* walking pointer over matrix B   */
  float32_t *dst  = pDst->pData;      /* walking pointer over the output */
  uint32_t total;                     /* element count of one matrix     */
  uint32_t count;                     /* elements left to process        */

#ifdef ARM_MATH_MATRIX_CHECK
  /* All three matrices must share the same dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols))
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* ARM_MATH_MATRIX_CHECK */
  {
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Unrolled path: four elements per iteration. */
    for (count = total >> 2U; count > 0U; count--)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *dst++ = *srcA++ + *srcB++;
      *dst++ = *srcA++ + *srcB++;
      *dst++ = *srcA++ + *srcB++;
      *dst++ = *srcA++ + *srcB++;
    }
    /* 0..3 leftover elements fall through to the cleanup loop. */
    count = total & 3U;
#else
    /* No unrolling: handle every element in the cleanup loop. */
    count = total;
#endif /* ARM_MATH_LOOPUNROLL */

    while (count > 0U)
    {
      /* C(m,n) = A(m,n) + B(m,n) */
      *dst++ = *srcA++ + *srcB++;
      count--;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixAdd group
@} end of MatrixAdd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_add_q15.c
* Description: Q15 matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,119 +26,100 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Q15 matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
numSamples = (uint16_t) (pSrcA->numRows * pSrcA->numCols);
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop unrolling */
blkCnt = (uint32_t) numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
*__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
/* Decrement the loop counter */
blkCnt--;
/* C(m,n) = A(m,n) + B(m,n) */
/* Add and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = (uint32_t) numSamples % 0x4U;
/* q15 pointers of input and output are initialized */
while (blkCnt > 0U)
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
/* Decrement the loop counter */
blkCnt--;
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = (uint32_t) numSamples;
/* q15 pointers of input and output are initialized */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, Saturate and then store the results in the destination buffer. */
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ + *pInB++), 16);
/* Decrement the loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -146,6 +127,101 @@ arm_status arm_mat_add_q15(
return (status);
}
#else
/* Scalar / DSP-extension implementation of saturating Q15 matrix
 * addition: pDst = pSrcA + pSrcB element-wise, with results saturated to
 * the Q15 range. */
arm_status arm_mat_add_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix addition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: A, B and the destination must all
 * share the same dimensions. */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
/* SIMD path: each __QADD16 performs two saturating q15 additions on a
 * packed pair, so two calls cover the 4 unrolled samples. The
 * read_q15x2_ia/write_q15x2_ia helpers presumably load/store two q15
 * values and advance the pointer ("_ia" = increment after) — confirm
 * against their definitions. */
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
/* Plain path: widen each operand to q31, add, then saturate to 16 bits. */
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* Add, saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
/* NOTE(review): __QADD16 operates on packed halfword pairs; here it is
 * applied to single q15 operands, with the result truncated back to
 * q15 — the low halfword carries the saturated sum. */
*pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixAdd group
@} end of MatrixAdd group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_add_q31.c
* Description: Q31 matrix addition
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,163 +26,99 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixAdd
* @{
@addtogroup MatrixAdd
@{
*/
/**
* @brief Q31 matrix addition.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
*/
@brief Q31 matrix addition.
@param[in] pSrcA points to first input matrix structure
@param[in] pSrcB points to second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_add_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
        arm_matrix_instance_q31 * pDst)
{
    /* Helium (MVE) implementation of element-wise saturating Q31 matrix
       addition.  The main loop processes 4 lanes per iteration with a
       saturating vector add; the 1..3 leftover elements are handled with
       tail predication, so no scalar cleanup loop is needed.
       (This block previously contained two implementations interleaved by
       a bad merge; it is reconstructed here as the pure MVE version.) */
    uint32_t  numSamples;           /* total number of elements in the matrix */
    q31_t    *pDataA, *pDataB, *pDataDst;
    q31x4_t   vecA, vecB, vecDst;   /* vecDst is only stored through an
                                       active-lane predicated store, so it
                                       needs no initial value */
    q31_t const *pSrcAVec;
    q31_t const *pSrcBVec;
    uint32_t  blkCnt;               /* loop counter */
    arm_status status;              /* status of matrix addition */

    pDataA = pSrcA->pData;
    pDataB = pSrcB->pData;
    pDataDst = pDst->pData;
    pSrcAVec = (q31_t const *) pDataA;
    pSrcBVec = (q31_t const *) pDataB;

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition: all three matrices must have
     identical dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* Total number of samples in the input matrix */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

    /* Main loop: 4 saturated Q31 additions per iteration */
    blkCnt = numSamples >> 2;
    while (blkCnt > 0U)
    {
        /* C(m,n) = A(m,n) + B(m,n) */
        vecA = vld1q(pSrcAVec);
        pSrcAVec += 4;
        vecB = vld1q(pSrcBVec);
        pSrcBVec += 4;
        vecDst = vqaddq(vecA, vecB);   /* saturating add, Q31 range */
        vst1q(pDataDst, vecDst);
        pDataDst += 4;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: 1..3 remaining samples, computed and stored under predication
       so lanes beyond blkCnt are never written. */
    blkCnt = numSamples & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);
        vecA = vld1q(pSrcAVec);
        vecB = vld1q(pSrcBVec);
        vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
        vstrwq_p(pDataDst, vecDst, p0);
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
arm_status arm_mat_add_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
        arm_matrix_instance_q31 * pDst)
{
  /* Scalar implementation: element-wise saturating Q31 addition,
     optionally unrolled by four when ARM_MATH_LOOPUNROLL is enabled. */
  q31_t *pA = pSrcA->pData;        /* walking pointer over matrix A */
  q31_t *pB = pSrcB->pData;        /* walking pointer over matrix B */
  q31_t *pC = pDst->pData;         /* walking pointer over destination */
  uint32_t total;                  /* element count of the matrices */
  uint32_t cnt;                    /* loop counter */
  arm_status status;               /* status of matrix addition */

#ifdef ARM_MATH_MATRIX_CHECK
  /* All three matrices must share the same dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* Total number of samples in input matrix */
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Four saturated additions per trip. */
    for (cnt = total >> 2U; cnt > 0U; cnt--)
    {
      /* C(m,n) = A(m,n) + B(m,n), saturated to the Q31 range */
      *pC++ = __QADD(*pA++, *pB++);
      *pC++ = __QADD(*pA++, *pB++);
      *pC++ = __QADD(*pA++, *pB++);
      *pC++ = __QADD(*pA++, *pB++);
    }

    /* 0..3 elements left over after unrolling. */
    cnt = total % 0x4U;
#else
    /* No unrolling: process every sample in the single loop below. */
    cnt = total;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; cnt > 0U; cnt--)
    {
      /* C(m,n) = A(m,n) + B(m,n), saturated to the Q31 range */
      *pC++ = __QADD(*pA++, *pB++);
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
 * @} end of MatrixAdd group
 */

View File

@@ -0,0 +1,256 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f16.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/* Helium (MVE) Cholesky decomposition, half precision.
   Writes a lower-triangular G into pDst such that A = G G^t.  Columns of
   G are finalized left to right: for column i, the update
     G[j][i] = A[j][i] - sum_{k<i} G[i][k]*G[j][k]
   is evaluated with predicated f16x8 fused multiply-subtract, four output
   rows (j..j+3) at a time, followed by a positivity test on the pivot and
   a scaling of the column by 1/sqrt(pivot). */
arm_status arm_mat_cholesky_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of the decomposition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and equally sized */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
_Float16 invSqrtVj; /* 1/sqrt of the current pivot (diagonal) element */
float16_t *pA,*pG;  /* pA: input matrix data, pG: output lower triangle */
int kCnt;           /* remaining inner-product length; drives tail predication */
mve_pred16_t p0;
f16x8_t acc, acc0, acc1, acc2, acc3;
f16x8_t vecGi;
f16x8_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
/* Bulk of the column: four rows j..j+3 per iteration. */
for(j=i ; j+3 < n ; j+=4)
{
/* Seed lane 0 of each accumulator with A[j][i]; remaining lanes start
   at zero and collect the -G[i][k]*G[j][k] correction terms. */
acc0 = vdupq_n_f16(0.0f16);
acc0[0]=pA[(j + 0) * n + i];
acc1 = vdupq_n_f16(0.0f16);
acc1[0]=pA[(j + 1) * n + i];
acc2 = vdupq_n_f16(0.0f16);
acc2[0]=pA[(j + 2) * n + i];
acc3 = vdupq_n_f16(0.0f16);
acc3[0]=pA[(j + 3) * n + i];
kCnt = i;
for(k=0; k < i ; k+=8)
{
/* p0 enables min(kCnt, 8) lanes; zeroed loads keep inactive lanes inert. */
p0 = vctp16q(kCnt);
vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
vecGj0=vldrhq_z_f16(&pG[(j + 0) * n + k],p0);
vecGj1=vldrhq_z_f16(&pG[(j + 1) * n + k],p0);
vecGj2=vldrhq_z_f16(&pG[(j + 2) * n + k],p0);
vecGj3=vldrhq_z_f16(&pG[(j + 3) * n + k],p0);
/* fused multiply-subtract: acc -= vecGi * vecGj on active lanes */
acc0 = vfmsq_m(acc0, vecGi, vecGj0, p0);
acc1 = vfmsq_m(acc1, vecGi, vecGj1, p0);
acc2 = vfmsq_m(acc2, vecGi, vecGj2, p0);
acc3 = vfmsq_m(acc3, vecGi, vecGj3, p0);
kCnt -= 8;
}
/* Horizontal reduction of each accumulator yields G[j][i] before scaling. */
pG[(j + 0) * n + i] = vecAddAcrossF16Mve(acc0);
pG[(j + 1) * n + i] = vecAddAcrossF16Mve(acc1);
pG[(j + 2) * n + i] = vecAddAcrossF16Mve(acc2);
pG[(j + 3) * n + i] = vecAddAcrossF16Mve(acc3);
}
/* Remaining (< 4) rows of the column, one at a time. */
for(; j < n ; j++)
{
kCnt = i;
acc = vdupq_n_f16(0.0f16);
acc[0] = pA[j * n + i];
for(k=0; k < i ; k+=8)
{
p0 = vctp16q(kCnt);
vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
vecGj=vldrhq_z_f16(&pG[j * n + k],p0);
acc = vfmsq_m(acc, vecGi, vecGj,p0);
kCnt -= 8;
}
pG[j * n + i] = vecAddAcrossF16Mve(acc);
}
/* A non-positive pivot means the input is not positive definite. */
if ((_Float16)pG[i * n + i] <= 0.0f16)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
/* sqrt evaluated in float32 for accuracy, then applied in half precision */
invSqrtVj = 1.0f16/(_Float16)sqrtf((float32_t)pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/* Scalar Cholesky decomposition, half precision.
   Writes a lower-triangular G into pDst such that A = G G^t.  Each
   in-place update below is rounded to float16 when stored back to pG,
   so intermediate precision matches the vector implementations. */
arm_status arm_mat_cholesky_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
arm_status status; /* status of the decomposition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and equally sized */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float16_t invSqrtVj; /* 1/sqrt of the current pivot (diagonal) element */
float16_t *pA,*pG;   /* pA: input matrix data, pG: output lower triangle */
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ; i < n ; i++)
{
/* G[j][i] = A[j][i] - sum_{k<i} G[i][k]*G[j][k] */
for(j=i ; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
for(k=0; k < i ; k++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] - (_Float16)pG[i * n + k] * (_Float16)pG[j * n + k];
}
}
/* A non-positive pivot means the input is not positive definite. */
if ((_Float16)pG[i * n + i] <= 0.0f16)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
/* The division is done in float32 for accuracy reason and
because doing it in f16 would not have any impact on the performances.
*/
invSqrtVj = 1.0f/sqrtf((float32_t)pG[i * n + i]);
/* Scale the column by the inverse square root of the pivot. */
for(j=i ; j < n ; j++)
{
pG[j * n + i] = (_Float16)pG[j * n + i] * (_Float16)invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,438 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f32.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 05 October 2021
* $Revision: V1.9.1
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixChol Cholesky and LDLT decompositions
Computes the Cholesky or LDL^t decomposition of a matrix.
If the input matrix does not have a decomposition, then the
algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/* Helium (MVE) Cholesky decomposition, single precision.
   Writes a lower-triangular G into pDst such that A = G G^t.  Columns of
   G are finalized left to right: for column i, the correction
     sum_{k<i} G[i][k]*G[j][k]
   is accumulated with predicated f32x4 fused multiply-add, four output
   rows (j..j+3) at a time, and subtracted from A[j][i]; the column is then
   scaled by 1/sqrt(pivot). */
arm_status arm_mat_cholesky_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of the decomposition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and equally sized */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float32_t invSqrtVj; /* 1/sqrt of the current pivot (diagonal) element */
float32_t *pA,*pG;   /* pA: input matrix data, pG: output lower triangle */
int kCnt;            /* remaining inner-product length; drives tail predication */
mve_pred16_t p0;
f32x4_t acc, acc0, acc1, acc2, acc3;
f32x4_t vecGi;
f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
/* Bulk of the column: four rows j..j+3 per iteration. */
for(j=i ; j+3 < n ; j+=4)
{
/* Start from A[j][i]; the accumulators collect the correction sums. */
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
kCnt = i;
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
for(k=0; k < i ; k+=4)
{
/* p0 enables min(kCnt, 4) lanes; zeroed loads keep inactive lanes inert. */
p0 = vctp32q(kCnt);
vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
vecGj0=vldrwq_z_f32(&pG[(j + 0) * n + k],p0);
vecGj1=vldrwq_z_f32(&pG[(j + 1) * n + k],p0);
vecGj2=vldrwq_z_f32(&pG[(j + 2) * n + k],p0);
vecGj3=vldrwq_z_f32(&pG[(j + 3) * n + k],p0);
/* fused multiply-add: acc += vecGi * vecGj on active lanes */
acc0 = vfmaq_m(acc0, vecGi, vecGj0, p0);
acc1 = vfmaq_m(acc1, vecGi, vecGj1, p0);
acc2 = vfmaq_m(acc2, vecGi, vecGj2, p0);
acc3 = vfmaq_m(acc3, vecGi, vecGj3, p0);
kCnt -= 4;
}
/* Horizontal reduction, then subtract the correction from A[j][i]. */
pG[(j + 0) * n + i] -= vecAddAcrossF32Mve(acc0);
pG[(j + 1) * n + i] -= vecAddAcrossF32Mve(acc1);
pG[(j + 2) * n + i] -= vecAddAcrossF32Mve(acc2);
pG[(j + 3) * n + i] -= vecAddAcrossF32Mve(acc3);
}
/* Remaining (< 4) rows of the column, one at a time. */
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
kCnt = i;
acc = vdupq_n_f32(0.0f);
for(k=0; k < i ; k+=4)
{
p0 = vctp32q(kCnt);
vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
vecGj=vldrwq_z_f32(&pG[j * n + k],p0);
acc = vfmaq_m(acc, vecGi, vecGj,p0);
kCnt -= 4;
}
pG[j * n + i] -= vecAddAcrossF32Mve(acc);
}
/* A non-positive pivot means the input is not positive definite. */
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
/* Scale the column by the inverse square root of the pivot. */
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
/* NEON Cholesky decomposition, single precision.
   Writes a lower-triangular G into pDst such that A = G G^t.  For each
   column i the correction sum_{k<i} G[i][k]*G[j][k] is accumulated with
   f32x4 fused multiply-add for four rows at a time, reduced pairwise
   (vpadds_f32 on AArch64, vpadd_f32 + lane extraction on AArch32), and a
   scalar loop finishes the k-tail before the correction is subtracted. */
arm_status arm_mat_cholesky_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of the decomposition */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and equally sized */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
int i,j,k;
int n = pSrc->numRows;
float32_t invSqrtVj; /* 1/sqrt of the current pivot (diagonal) element */
float32_t *pA,*pG;   /* pA: input matrix data, pG: output lower triangle */
int kCnt;            /* vector-trip / tail counters for the inner product */
f32x4_t acc, acc0, acc1, acc2, acc3;
f32x4_t vecGi;
f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
#if !defined(__aarch64__)
f32x2_t tmp = vdup_n_f32(0); /* scratch for pairwise reduction on AArch32 */
#endif
float32_t sum=0.0f;
float32_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
pA = pSrc->pData;
pG = pDst->pData;
for(i=0 ;i < n ; i++)
{
/* Bulk of the column: four rows j..j+3 per iteration. */
for(j=i ; j+3 < n ; j+=4)
{
/* Start from A[j][i]; the correction sum is subtracted below. */
pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
/* Vector part of the inner product: 4 k-values per trip. */
kCnt = i >> 2;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f32(&pG[i * n + k]);
vecGj0=vld1q_f32(&pG[(j + 0) * n + k]);
vecGj1=vld1q_f32(&pG[(j + 1) * n + k]);
vecGj2=vld1q_f32(&pG[(j + 2) * n + k]);
vecGj3=vld1q_f32(&pG[(j + 3) * n + k]);
acc0 = vfmaq_f32(acc0, vecGi, vecGj0);
acc1 = vfmaq_f32(acc1, vecGi, vecGj1);
acc2 = vfmaq_f32(acc2, vecGi, vecGj2);
acc3 = vfmaq_f32(acc3, vecGi, vecGj3);
kCnt--;
k+=4;
}
/* Pairwise horizontal reduction of each 4-lane accumulator. */
#if defined(__aarch64__)
sum0 = vpadds_f32(vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)));
sum1 = vpadds_f32(vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)));
sum2 = vpadds_f32(vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)));
sum3 = vpadds_f32(vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)));
#else
tmp = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
tmp = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
/* Scalar tail: remaining 0..3 k-values. */
kCnt = i & 3;
while(kCnt > 0)
{
sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
kCnt--;
k++;
}
pG[(j + 0) * n + i] -= sum0;
pG[(j + 1) * n + i] -= sum1;
pG[(j + 2) * n + i] -= sum2;
pG[(j + 3) * n + i] -= sum3;
}
/* Remaining (< 4) rows of the column, one at a time. */
for(; j < n ; j++)
{
pG[j * n + i] = pA[j * n + i];
acc = vdupq_n_f32(0.0f);
kCnt = i >> 2;
k=0;
while(kCnt > 0)
{
vecGi=vld1q_f32(&pG[i * n + k]);
vecGj=vld1q_f32(&pG[j * n + k]);
acc = vfmaq_f32(acc, vecGi, vecGj);
kCnt--;
k+=4;
}
#if defined(__aarch64__)
sum = vpadds_f32(vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)));
#else
tmp = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
#endif
kCnt = i & 3;
while(kCnt > 0)
{
sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
kCnt--;
k++;
}
pG[j * n + i] -= sum;
}
/* A non-positive pivot means the input is not positive definite. */
if (pG[i * n + i] <= 0.0f)
{
return(ARM_MATH_DECOMPOSITION_FAILURE);
}
/* Scale the column by the inverse square root of the pivot. */
invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
for(j=i; j < n ; j++)
{
pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cholesky_f32(
  const arm_matrix_instance_f32 * pSrc,
        arm_matrix_instance_f32 * pDst)
{
  /* Plain C Cholesky: fills the lower triangle of pDst with G so that
     A = G G^t.  Column `col` is completed (dot-product update, pivot
     positivity test, then scaling by 1/sqrt(pivot)) before the next. */
  arm_status status;                       /* status of the decomposition */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Both matrices must be square and of identical size. */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pDst->numRows != pDst->numCols) ||
      (pSrc->numRows != pDst->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    int        col, row, t;
    int        n = pSrc->numRows;
    float32_t  invSqrtPivot;               /* 1/sqrt of the pivot element */
    float32_t *pA = pSrc->pData;           /* input matrix data */
    float32_t *pG = pDst->pData;           /* output lower triangle */

    for (col = 0; col < n; col++)
    {
      /* G[row][col] = A[row][col] - sum_{t<col} G[col][t]*G[row][t] */
      for (row = col; row < n; row++)
      {
        pG[row * n + col] = pA[row * n + col];
        for (t = 0; t < col; t++)
        {
          pG[row * n + col] -= pG[col * n + t] * pG[row * n + t];
        }
      }

      /* A non-positive pivot means the input is not positive definite. */
      if (pG[col * n + col] <= 0.0f)
      {
        return (ARM_MATH_DECOMPOSITION_FAILURE);
      }

      /* Scale the column by the inverse square root of the pivot. */
      invSqrtPivot = 1.0f / sqrtf(pG[col * n + col]);
      for (row = col; row < n; row++)
      {
        pG[row * n + col] = pG[row * n + col] * invSqrtPivot;
      }
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,122 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cholesky_f64.c
* Description: Floating-point Cholesky decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point Cholesky decomposition of positive-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pDst points to the instance of the output floating-point matrix structure.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* If the matrix is ill conditioned or only semi-definite, then it is better using the LDL^t decomposition.
* The decomposition of A is returning a lower triangular matrix U such that A = U U^t
*/
arm_status arm_mat_cholesky_f64(
  const arm_matrix_instance_f64 * pSrc,
        arm_matrix_instance_f64 * pDst)
{
  /* Double-precision Cholesky: fills the lower triangle of pDst with G so
     that A = G G^t.  Column `col` is completed (dot-product update, pivot
     positivity test, then scaling by 1/sqrt(pivot)) before the next. */
  arm_status status;                       /* status of the decomposition */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Both matrices must be square and of identical size. */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pDst->numRows != pDst->numCols) ||
      (pSrc->numRows != pDst->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    int        col, row, t;
    int        n = pSrc->numRows;
    float64_t  invSqrtPivot;               /* 1/sqrt of the pivot element */
    float64_t *pA = pSrc->pData;           /* input matrix data */
    float64_t *pG = pDst->pData;           /* output lower triangle */

    for (col = 0; col < n; col++)
    {
      /* G[row][col] = A[row][col] - sum_{t<col} G[col][t]*G[row][t] */
      for (row = col; row < n; row++)
      {
        pG[row * n + col] = pA[row * n + col];
        for (t = 0; t < col; t++)
        {
          pG[row * n + col] -= pG[col * n + t] * pG[row * n + t];
        }
      }

      /* A non-positive pivot means the input is not positive definite. */
      if (pG[col * n + col] <= 0.0)
      {
        return (ARM_MATH_DECOMPOSITION_FAILURE);
      }

      /* Scale the column by the inverse square root of the pivot. */
      invSqrtPivot = 1.0 / sqrt(pG[col * n + col]);
      for (row = col; row < n; row++)
      {
        pG[row * n + col] = pG[row * n + col] * invSqrtPivot;
      }
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,935 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup CmplxMatrixMult
@{
*/
/**
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
#endif
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
#include "arm_helium_utils.h"
#define DONTCARE 0 /* inactive lane content */
/* 2x2 complex f16 matrix multiply, Helium specialization.
   Loads all of B with one gather (offsets interleave both B columns into a
   single f16x8), one gather per A row pair, and forms the products with the
   vcmulq / vcmlaq_rot90 complex-multiply pair; pairwise adds across 64-bit
   sub-vectors then produce each output complex element. */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 2
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0,vecColAOffs0,vecColAOffs1;
float16_t *pInA0 = pInA;
f16x8_t acc0, acc1;
f16x8_t vecB, vecA0, vecA1;
f16x8_t vecTmp;
uint16_t tmp;
/* Gather offsets picking B column 0 then column 1, complex pairs. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2, 3,
MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
tmp = 0;
/* Incrementing-wrapping offsets 0,1,2,3,0,1,2,3: row 0 of A twice. */
vecColAOffs0 = viwdupq_u16(tmp, 4, 1);
tmp = (CMPLX_DIM * MATRIX_DIM);
/* Same pattern shifted by one row: row 1 of A twice. */
vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
pInB = (float16_t const *)pSrcB->pData;
vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
/* Complex multiply: vcmulq forms the real products, vcmlaq_rot90 adds
   the rotated (imaginary) cross terms. */
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
/*
 * Compute
 * re0+re1 | im0+im1 | re0+re1 | im0+im1
 * re2+re3 | im2+im3 | re2+re3 | im2+im3
 */
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
vecTmp = vaddq(vecTmp, acc0);
/* Store one complex element (two f16) at a time via 32-bit moves. */
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
vecTmp = vaddq(vecTmp, acc1);
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
*(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
/* 3x3 complex f16 matrix multiply, Helium specialization.
   Each A row (3 complex = 6 f16) fits one f16x8 load; each B column is
   gathered with strided offsets.  Only the low 6 lanes are meaningful, so
   a vctp16q predicate masks the top two lanes of the gathers.  The three
   B columns are processed in sequence, each producing one output column
   via the vcmulq / vcmlaq_rot90 complex-multiply pair and an intra-vector
   complex sum. */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 3
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2;
f16x8_t vecB, vecA0, vecA1, vecA2;
/* Gather offsets walking one B column: rows 0,1,2, complex pairs. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
DONTCARE, DONTCARE
};
/* enable predication to disable upper half complex vector element */
mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
vecA0 = vldrhq_f16(pInA0);
vecA1 = vldrhq_f16(pInA1);
vecA2 = vldrhq_f16(pInA2);
/* --- B column 0 --- */
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
/* Complex multiply: vcmulq forms the real products, vcmlaq_rot90 adds
   the rotated (imaginary) cross terms. */
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
acc0 = vcmulq(vecA0, vecB);
acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
acc1 = vcmulq(vecA1, vecB);
acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
acc2 = vcmulq(vecA2, vecB);
acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
/* 4x4 complex f16 matrix multiply, Helium specialization.
   Each A row (4 complex = 8 f16) fills one f16x8 exactly, so plain loads
   suffice for A while each B column is gathered with strided offsets.
   The four B columns are processed in sequence; for each, the four A rows
   are multiplied with the vcmulq / vcmlaq_rot90 complex-multiply pair and
   reduced with an intra-vector complex sum into one output column. */
__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
#define MATRIX_DIM 4
float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs0;
float16_t *pInA0 = pInA;
float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
float16_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
f16x8_t acc0, acc1, acc2, acc3;
f16x8_t vecB, vecA;
/* Gather offsets walking one B column: rows 0..3, complex pairs. */
static const uint16_t offsetB0[8] = { 0, 1,
MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
};
vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
pInB = (float16_t const *)pSrcB->pData;
/* --- B column 0 --- */
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrhq_f16(pInA0);
acc0 = vcmulq(vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vldrhq_f16(pInA1);
acc1 = vcmulq(vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vldrhq_f16(pInA2);
acc2 = vcmulq(vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vldrhq_f16(pInA3);
acc3 = vcmulq(vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
#undef MATRIX_DIM
}
/**
  @brief         Floating-point (f16) complex matrix multiplication — Helium/MVE variant.
  @param[in]     pSrcA  points to first input complex matrix structure
  @param[in]     pSrcB  points to second input complex matrix structure
  @param[out]    pDst   points to output complex matrix structure
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed

  Small square matrices (1x1..4x4) are dispatched to dedicated routines;
  the general path processes 4 rows of A at a time, gathering 4 rows of the
  current B column per vector with shifted-offset gather loads and using
  vcmlaq/vcmlaq_rot90 complex multiply-accumulate pairs.
 */
arm_status arm_mat_cmplx_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t const *pInB = (float16_t const *) pSrcB->pData; /* input data matrix pointer B */
float16_t const *pInA = (float16_t const *) pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint16x8_t vecOffs, vecColBOffs; /* gather offsets into matrix B (element indices) */
uint32_t blkCnt,rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
 * small squared matrix specialized routines
 */
if (numRowsA == numColsB && numColsB == numColsA)
{
if (numRowsA == 1)
{
/* 1x1 case: single complex product, computed in scalar form */
pOut[0] = (_Float16)pInA[0] * (_Float16)pInB[0] - (_Float16)pInA[1] * (_Float16)pInB[1];
pOut[1] = (_Float16)pInA[0] * (_Float16)pInB[1] + (_Float16)pInA[1] * (_Float16)pInB[0];
return (ARM_MATH_SUCCESS);
}
else if (numRowsA == 2)
return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
}
/* Gather offsets selecting the (real, imag) pair of one B column from
 * 4 consecutive B rows; each row is numColsB * CMPLX_DIM elements apart. */
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;
/*
 * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
 */
/*
 * row loop: process 4 rows of A per iteration
 */
rowCnt = row >> 2;
while (rowCnt > 0u)
{
/*
 * Output pointer is set to starting address of the row being processed
 */
px = pOut + i * CMPLX_DIM;
i = i + 4 * numColsB;
/*
 * For every row wise process, the column loop counter is to be initiated
 */
col = numColsB;
/*
 * For every row wise process, the pInB pointer is set
 * to the starting address of the pSrcB data
 */
pInB = (float16_t const *) pSrcB->pData;
/*
 * column loop
 */
while (col > 0u)
{
/*
 * generate 4 columns elements
 */
/*
 * Matrix A columns number of MAC operations are to be performed
 */
float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
float16_t const *pInA0 = pInA;
float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
f16x8_t acc0, acc1, acc2, acc3;
acc0 = vdupq_n_f16(0.0f16);
acc1 = vdupq_n_f16(0.0f16);
acc2 = vdupq_n_f16(0.0f16);
acc3 = vdupq_n_f16(0.0f16);
pSrcA0Vec = (float16_t const *) pInA0;
pSrcA1Vec = (float16_t const *) pInA1;
pSrcA2Vec = (float16_t const *) pInA2;
pSrcA3Vec = (float16_t const *) pInA3;
vecOffs = vecColBOffs;
/*
 * process 1 x 4 block output
 * (8 f16 lanes = 4 complex elements per vector iteration)
 */
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
/*
 * move Matrix B read offsets, 4 rows down
 */
vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));
/* vcmlaq accumulates the real-part products, vcmlaq_rot90 the imaginary-part products */
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
blkCnt--;
}
/*
 * Unsupported addressing mode compiler crash
 */
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
/* predicated gather disables the lanes past the matrix edge */
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
/*
 * move Matrix B read offsets, 4 rows down
 */
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec);
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec);
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec);
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
}
/* reduce each accumulator's intra-vector complex partial sums into one output element */
mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
px += CMPLX_DIM;
/*
 * Decrement the column loop counter
 */
col--;
/*
 * Update the pointer pInB to point to the starting address of the next column
 */
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
 * Update the pointer pInA to point to the starting address of the next row
 */
pInA += (numColsA * 4) * CMPLX_DIM;
/*
 * Decrement the row loop counter
 */
rowCnt --;
}
/* tail: remaining 1..3 rows of A, processed one row at a time */
rowCnt = row & 3;
while (rowCnt > 0u)
{
/*
 * Output pointer is set to starting address of the row being processed
 */
px = pOut + i * CMPLX_DIM;
i = i + numColsB;
/*
 * For every row wise process, the column loop counter is to be initiated
 */
col = numColsB;
/*
 * For every row wise process, the pInB pointer is set
 * to the starting address of the pSrcB data
 */
pInB = (float16_t const *) pSrcB->pData;
/*
 * column loop
 */
while (col > 0u)
{
/*
 * generate 4 columns elements
 */
/*
 * Matrix A columns number of MAC operations are to be performed
 */
float16_t const *pSrcA0Vec;
float16_t const *pInA0 = pInA;
f16x8_t acc0;
acc0 = vdupq_n_f16(0.0f16);
pSrcA0Vec = (float16_t const *) pInA0;
vecOffs = vecColBOffs;
/*
 * process 1 x 4 block output
 */
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
/*
 * move Matrix B read offsets, 4 rows down
 */
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 8;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
blkCnt--;
}
/*
 * tail
 */
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
f16x8_t vecB, vecA;
vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
}
mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
px += CMPLX_DIM;
/*
 * Decrement the column loop counter
 */
col--;
/*
 * Update the pointer pInB to point to the starting address of the next column
 */
pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
 * Update the pointer pInA to point to the starting address of the next row
 */
pInA += numColsA * CMPLX_DIM;
rowCnt--;
}
/*
 * set status as ARM_MATH_SUCCESS
 */
status = ARM_MATH_SUCCESS;
}
/*
 * Return to application
 */
return (status);
}
#else
/**
  @brief         Floating-point (f16) complex matrix multiplication — scalar fallback.
  @param[in]     pSrcA  points to first input complex matrix structure
  @param[in]     pSrcB  points to second input complex matrix structure
  @param[out]    pDst   points to output complex matrix structure
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed

  Matrices are stored as interleaved (real, imag) pairs; a complex element
  therefore occupies 2 float16_t values, which is why all pointer strides
  below are multiplied by 2.
 */
arm_status arm_mat_cmplx_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
  float16_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
  float16_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
  float16_t *px;                                 /* Temporary output data matrix pointer */
  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
  _Float16 sumReal, sumImag;                     /* Accumulator */
  _Float16 a1, b1, c1, d1;
  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
  arm_status status;                             /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
  _Float16 a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows) ||
      (pSrcB->numCols != pDst->numCols) )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
    /* row loop */
    do
    {
      /* Output pointer is set to starting address of the row being processed */
      px = pOut + 2 * i;
      /* For every row wise process, the column loop counter is to be initiated */
      col = numColsB;
      /* For every row wise process, the pIn2 pointer is set
      ** to the starting address of the pSrcB data */
      pIn2 = pSrcB->pData;
      j = 0U;
      /* column loop */
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
        sumReal = 0.0f16;
        sumImag = 0.0f16;
        /* Initiate pointer pIn1 to point to starting address of column being processed */
        pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
        /* Apply loop unrolling and compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;
        /* matrix multiplication */
        while (colCnt > 0U)
        {
          /* Reading real part of complex matrix A */
          a0 = *pIn1;
          /* Reading real part of complex matrix B */
          c0 = *pIn2;
          /* Reading imaginary part of complex matrix A */
          b0 = *(pIn1 + 1U);
          /* Reading imaginary part of complex matrix B */
          d0 = *(pIn2 + 1U);
          /* Multiply and Accumulates */
          sumReal += a0 * c0;
          sumImag += b0 * c0;
          /* update pointers: A moves one complex element right, B one complex row down */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;
          /* Multiply and Accumulates: (a+jb)(c+jd) = (ac - bd) + j(bc + ad) */
          sumReal -= b0 * d0;
          sumImag += a0 * d0;
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          /* read real and imag values from pSrcA and pSrcB buffer */
          a1 = *(pIn1 );
          c1 = *(pIn2 );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);
          /* Multiply and Accumulates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;
          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;
          /* Multiply and Accumulates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;
          a0 = *(pIn1 );
          c0 = *(pIn2 );
          b0 = *(pIn1 + 1U);
          d0 = *(pIn2 + 1U);
          /* Multiply and Accumulates */
          sumReal += a0 * c0;
          sumImag += b0 * c0;
          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;
          /* Multiply and Accumulates */
          sumReal -= b0 * d0;
          sumImag += a0 * d0;
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          a1 = *(pIn1 );
          c1 = *(pIn2 );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);
          /* Multiply and Accumulates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;
          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;
          /* Multiply and Accumulates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;
          /* Decrement loop count */
          colCnt--;
        }
        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
        ** No loop unrolling is used. */
        colCnt = numColsA % 0x4U;
#else
        /* Initialize blkCnt with number of samples */
        colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          a1 = *(pIn1 );
          c1 = *(pIn2 );
          b1 = *(pIn1 + 1U);
          d1 = *(pIn2 + 1U);
          /* Multiply and Accumulates */
          sumReal += a1 * c1;
          sumImag += b1 * c1;
          /* update pointers */
          pIn1 += 2U;
          pIn2 += 2 * numColsB;
          /* Multiply and Accumulates */
          sumReal -= b1 * d1;
          sumImag += a1 * d1;
          /* Decrement loop counter */
          colCnt--;
        }
        /* Store result in destination buffer */
        *px++ = sumReal;
        *px++ = sumImag;
        /* Update pointer pIn2 to point to starting address of next column */
        j++;
        pIn2 = pSrcB->pData + 2U * j;
        /* Decrement column loop counter */
        col--;
      } while (col > 0U);
      /* Update pointer pInA to point to starting address of next row */
      i = i + numColsB;
      pInA = pInA + 2 * numColsA;
      /* Decrement row loop counter */
      row--;
    } while (row > 0U);
    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }
  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,39 +26,816 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup CmplxMatrixMult Complex Matrix Multiplication
*
* Complex Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
@defgroup CmplxMatrixMult Complex Matrix Multiplication
Complex Matrix multiplication is only defined if the number of columns of the
first matrix equals the number of rows of the second matrix.
Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
in an <code>M x P</code> matrix.
@par
When matrix size checking is enabled, the functions check:
- that the inner dimensions of <code>pSrcA</code> and <code>pSrcB</code> are equal;
- that the size of the output matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Floating-point Complex matrix multiplication.
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/**
  @brief         2x2 f32 complex matrix multiplication, MVE specialization.
  @param[in]     pSrcA  points to first input complex matrix structure
  @param[in]     pSrcB  points to second input complex matrix structure
  @param[out]    pDst   points to output complex matrix structure
  @return        ARM_MATH_SUCCESS (no size checking in this specialized path)

  One f32x4 vector holds 2 complex elements: a whole row of A, and — via the
  gather offsets — one whole column of B.  Each output element is the sum of
  the two complex lane-products of an accumulator.
 */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_2x2_mve(
    const arm_matrix_instance_f32 * pSrcA,
    const arm_matrix_instance_f32 * pSrcB,
    arm_matrix_instance_f32 * pDst)
{
    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
    float32_t *pInA = pSrcA->pData;  /* input data matrix pointer A */
    float32_t *pOut = pDst->pData;   /* output data matrix pointer */
    uint32x4_t vecColBOffs0;
    float32_t *pInA0 = pInA;
    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM2;
    f32x4_t acc0, acc1;
    f32x4_t vecB, vecA;
    /* gather offsets: (real, imag) of B column element in row 0, then row 1 */
    static const uint32_t offsetB0[4] = { 0, 1,
        MATRIX_DIM2 * CMPLX_DIM, MATRIX_DIM2 * CMPLX_DIM + 1
    };
    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
    pInB = (float32_t const *)pSrcB->pData;
    /* --- first B column --- */
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    /* vcmulq + vcmlaq_rot90 together form a full complex multiply per lane pair */
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    /* reduce the two complex partial products of each accumulator */
    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc1[1] + acc1[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc1[1] + acc1[3];
    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
  @brief         3x3 f32 complex matrix multiplication, MVE specialization.
  @param[in]     pSrcA  points to first input complex matrix structure
  @param[in]     pSrcB  points to second input complex matrix structure
  @param[out]    pDst   points to output complex matrix structure
  @return        ARM_MATH_SUCCESS (no size checking in this specialized path)

  A 3-element complex row/column spans 6 floats, i.e. 1.5 vectors, so each
  column is processed in two gather loads: the first covers B rows 0-1, the
  second covers row 2 with the upper half of the vector predicated off
  (INACTIVELANE offsets + vctp32q(CMPLX_DIM) mask).
 */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_3x3_mve(
    const arm_matrix_instance_f32 * pSrcA,
    const arm_matrix_instance_f32 * pSrcB,
    arm_matrix_instance_f32 * pDst)
{
    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
    float32_t *pInA = pSrcA->pData;  /* input data matrix pointer A */
    float32_t *pOut = pDst->pData;   /* output data matrix pointer */
    uint32x4_t vecColBOffs0, vecColBOffs1;
    float32_t *pInA0 = pInA;
    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM3;
    float32_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM3;
    f32x4_t acc0, acc1, acc2;
    f32x4_t vecB, vecA;
    /* enable predication to disable upper half complex vector element */
    mve_pred16_t p0 = vctp32q(CMPLX_DIM);
    static const uint32_t offsetB0[4] = { 0, 1,
        MATRIX_DIM3 * CMPLX_DIM, MATRIX_DIM3 * CMPLX_DIM + 1
    };
    static const uint32_t offsetB1[4] = { 2 * MATRIX_DIM3 * CMPLX_DIM, 2 * MATRIX_DIM3 * CMPLX_DIM + 1,
        INACTIVELANE, INACTIVELANE
    };
    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
    vecColBOffs1 = vldrwq_u32((uint32_t const *) offsetB1);
    pInB = (float32_t const *)pSrcB->pData;
    /* --- B column 0 --- */
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    /* last B row of the column, predicated to 1 complex element */
    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    /* reduce complex partial products into the output column */
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
  @brief         4x4 f32 complex matrix multiplication, MVE specialization.
  @param[in]     pSrcA  points to first input complex matrix structure
  @param[in]     pSrcB  points to second input complex matrix structure
  @param[out]    pDst   points to output complex matrix structure
  @return        ARM_MATH_SUCCESS (no size checking in this specialized path)

  A 4-element complex row/column spans 8 floats = 2 vectors, so each B column
  is loaded with two unpredicated gathers (rows 0-1, then rows 2-3); each of
  the four output columns is fully unrolled below.
 */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_4x4_mve(
    const arm_matrix_instance_f32 * pSrcA,
    const arm_matrix_instance_f32 * pSrcB,
    arm_matrix_instance_f32 * pDst)
{
    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
    float32_t *pInA = pSrcA->pData;  /* input data matrix pointer A */
    float32_t *pOut = pDst->pData;   /* output data matrix pointer */
    uint32x4_t vecColBOffs0, vecColBOffs1;
    float32_t *pInA0 = pInA;
    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM4;
    float32_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM4;
    float32_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM4;
    f32x4_t acc0, acc1, acc2, acc3;
    f32x4_t vecB, vecA;
    /* gather offsets: column element (real, imag) pairs from B rows 0-1 / rows 2-3 */
    static const uint32_t offsetB0[4] = { 0, 1,
        MATRIX_DIM4 * CMPLX_DIM, MATRIX_DIM4 * CMPLX_DIM + 1
    };
    static const uint32_t offsetB1[4] = { 2 * MATRIX_DIM4 * CMPLX_DIM, 2 * MATRIX_DIM4 * CMPLX_DIM + 1,
        3 * MATRIX_DIM4 * CMPLX_DIM, 3 * MATRIX_DIM4 * CMPLX_DIM + 1
    };
    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
    vecColBOffs1 = vldrwq_u32((uint32_t const *) offsetB1);
    pInB = (float32_t const *)pSrcB->pData;
    /* --- B column 0 --- */
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(pInA3);
    acc3 = vcmulq(vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    /* second half of the column (B rows 2-3) */
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(&pInA3[4]);
    acc3 = vcmlaq(acc3, vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    /* reduce complex partial products into the output column */
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(pInA3);
    acc3 = vcmulq(vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(&pInA3[4]);
    acc3 = vcmlaq(acc3, vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(pInA3);
    acc3 = vcmulq(vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(&pInA3[4]);
    acc3 = vcmlaq(acc3, vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
    pOut += CMPLX_DIM;
    /*
     * move to next B column
     */
    pInB = pInB + CMPLX_DIM;
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
    vecA = vldrwq_f32(pInA0);
    acc0 = vcmulq(vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(pInA1);
    acc1 = vcmulq(vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(pInA2);
    acc2 = vcmulq(vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(pInA3);
    acc3 = vcmulq(vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
    vecA = vldrwq_f32(&pInA0[4]);
    acc0 = vcmlaq(acc0, vecA, vecB);
    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
    vecA = vldrwq_f32(&pInA1[4]);
    acc1 = vcmlaq(acc1, vecA, vecB);
    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
    vecA = vldrwq_f32(&pInA2[4]);
    acc2 = vcmlaq(acc2, vecA, vecB);
    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
    vecA = vldrwq_f32(&pInA3[4]);
    acc3 = vcmlaq(acc3, vecA, vecB);
    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
 * @brief  Floating-point complex matrix multiplication (Helium/MVE specialization).
 *
 * Computes pDst = pSrcA * pSrcB where every matrix element is an interleaved
 * complex value occupying CMPLX_DIM float32 slots (real, imag).
 * Small square matrices (1x1, 2x2, 3x3, 4x4) are dispatched to dedicated
 * fixed-size kernels; the general path processes 4 output rows at a time,
 * then a residual loop handles the remaining 1-3 rows.
 *
 * @param[in]  pSrcA  points to the first input complex matrix structure
 * @param[in]  pSrcB  points to the second input complex matrix structure
 * @param[out] pDst   points to the output complex matrix structure
 * @return     ARM_MATH_SIZE_MISMATCH when the dimensions are incompatible
 *             (only checked when ARM_MATH_MATRIX_CHECK is defined),
 *             otherwise ARM_MATH_SUCCESS.
 */
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t const *pInB = (float32_t const *) pSrcB->pData; /* input data matrix pointer B */
float32_t const *pInA = (float32_t const *) pSrcA->pData; /* input data matrix pointer A */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs; /* gather offsets for column-wise reads of B */
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
 * small squared matrix specialized routines
 */
if (numRowsA == numColsB && numColsB == numColsA)
{
if (numRowsA == 1)
{
/* 1x1 case: direct scalar complex multiply (a+jb)(c+jd) */
pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
return (ARM_MATH_SUCCESS);
}
else if (numRowsA == 2)
return arm_mat_cmplx_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_cmplx_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_cmplx_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
 * Gather offsets (in float32 elements) used to read matrix B down a column:
 * lanes {0,1} select the (real, imag) pair of one B row, lanes {2,3} select
 * the pair of the next B row (one complex row is numColsB * CMPLX_DIM floats).
 */
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
/*
 * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
 */
/*
 * row loop
 * (4 rows of A / 4 rows of the output per iteration)
 */
rowCnt = row >> 2;
while (rowCnt > 0u)
{
/*
 * Output pointer is set to starting address of the row being processed
 */
px = pOut + i * CMPLX_DIM;
/* advance the output element index by 4 full output rows */
i = i + 4 * numColsB;
/*
 * For every row wise process, the column loop counter is to be initiated
 */
col = numColsB;
/*
 * For every row wise process, the pInB pointer is set
 * to the starting address of the pSrcB data
 */
pInB = (float32_t const *) pSrcB->pData;
/*
 * column loop
 */
while (col > 0u)
{
/*
 * generate 4 columns elements
 */
/*
 * Matrix A columns number of MAC operations are to be performed
 */
float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
/* pointers to 4 consecutive rows of A (one complex row = numColsA * CMPLX_DIM floats) */
float32_t const *pInA0 = pInA;
float32_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
float32_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
float32_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
f32x4_t acc0, acc1, acc2, acc3;
acc0 = vdupq_n_f32(0.0f);
acc1 = vdupq_n_f32(0.0f);
acc2 = vdupq_n_f32(0.0f);
acc3 = vdupq_n_f32(0.0f);
pSrcA0Vec = (float32_t const *) pInA0;
pSrcA1Vec = (float32_t const *) pInA1;
pSrcA2Vec = (float32_t const *) pInA2;
pSrcA3Vec = (float32_t const *) pInA3;
vecOffs = vecColBOffs;
/*
 * process 1 x 4 block output
 * (each iteration consumes 2 complex elements of A per row and 2 complex rows of B)
 */
blkCnt = (numColsA * CMPLX_DIM) >> 2;
while (blkCnt > 0U)
{
f32x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/*
 * advance Matrix B read offsets past the 2 complex rows just consumed
 * (numColsB * 2 * CMPLX_DIM float elements)
 */
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
/* vcmlaq + vcmlaq_rot90 together accumulate the full complex product */
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 * NOTE(review): vecB is loaded with a zeroing predicate, but the vld1q
 * reads of A below are unpredicated and may touch elements past the tail;
 * presumably the inactive lanes multiply against zeroed B lanes - confirm.
 */
blkCnt = (numColsA * CMPLX_DIM) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
/*
 * advance Matrix B read offsets past the 2 complex rows just consumed
 */
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec);
acc1 = vcmlaq(acc1, vecA, vecB);
acc1 = vcmlaq_rot90(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec);
acc2 = vcmlaq(acc2, vecA, vecB);
acc2 = vcmlaq_rot90(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec);
acc3 = vcmlaq(acc3, vecA, vecB);
acc3 = vcmlaq_rot90(acc3, vecA, vecB);
}
/*
 * Each accumulator holds two interleaved (real, imag) partial sums in
 * lanes {0,1} and {2,3}; fold them to produce one complex output per row.
 */
px[0 * CMPLX_DIM * numColsB + 0] = acc0[0] + acc0[2];
px[0 * CMPLX_DIM * numColsB + 1] = acc0[1] + acc0[3];
px[1 * CMPLX_DIM * numColsB + 0] = acc1[0] + acc1[2];
px[1 * CMPLX_DIM * numColsB + 1] = acc1[1] + acc1[3];
px[2 * CMPLX_DIM * numColsB + 0] = acc2[0] + acc2[2];
px[2 * CMPLX_DIM * numColsB + 1] = acc2[1] + acc2[3];
px[3 * CMPLX_DIM * numColsB + 0] = acc3[0] + acc3[2];
px[3 * CMPLX_DIM * numColsB + 1] = acc3[1] + acc3[3];
px += CMPLX_DIM;
/*
 * Decrement the column loop counter
 */
col--;
/*
 * Update the pointer pInB to point to the starting address of the next column
 */
pInB = (float32_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
 * Update the pointer pInA to point to the starting address of the next row
 * (skip the 4 rows of A just processed)
 */
pInA += (numColsA * 4) * CMPLX_DIM;
/*
 * Decrement the row loop counter
 */
rowCnt --;
}
/* residual loop: remaining 1-3 rows, processed one output row at a time */
rowCnt = row & 3;
while (rowCnt > 0u)
{
/*
 * Output pointer is set to starting address of the row being processed
 */
px = pOut + i * CMPLX_DIM;
i = i + numColsB;
/*
 * For every row wise process, the column loop counter is to be initiated
 */
col = numColsB;
/*
 * For every row wise process, the pInB pointer is set
 * to the starting address of the pSrcB data
 */
pInB = (float32_t const *) pSrcB->pData;
/*
 * column loop
 */
while (col > 0u)
{
/*
 * generate 4 columns elements
 */
/*
 * Matrix A columns number of MAC operations are to be performed
 */
float32_t const *pSrcA0Vec;
float32_t const *pInA0 = pInA;
f32x4_t acc0;
acc0 = vdupq_n_f32(0.0f);
pSrcA0Vec = (float32_t const *) pInA0;
vecOffs = vecColBOffs;
/*
 * process 1 x 4 block output
 */
blkCnt = (numColsA * CMPLX_DIM) >> 2;
while (blkCnt > 0U)
{
f32x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/*
 * advance Matrix B read offsets past the 2 complex rows just consumed
 */
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
blkCnt--;
}
/*
 * tail
 * NOTE(review): as in the main loop, the vld1q of A here is unpredicated.
 */
blkCnt = (numColsA * CMPLX_DIM) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
f32x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
vecA = vld1q(pSrcA0Vec);
acc0 = vcmlaq(acc0, vecA, vecB);
acc0 = vcmlaq_rot90(acc0, vecA, vecB);
}
/* fold lanes {0,1} and {2,3} into the final complex output element */
px[0] = acc0[0] + acc0[2];
px[1] = acc0[1] + acc0[3];
px += CMPLX_DIM;
/*
 * Decrement the column loop counter
 */
col--;
/*
 * Update the pointer pInB to point to the starting address of the next column
 */
pInB = (float32_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
 * Update the pointer pInA to point to the starting address of the next row
 */
pInA += numColsA * CMPLX_DIM;
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -73,15 +850,20 @@ arm_status arm_mat_cmplx_mult_f32(
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
float32_t sumReal1, sumImag1; /* accumulator */
float32_t a0, b0, c0, d0;
float32_t a1, b1, c1, d1;
float32_t a1, a1B,b1, b1B, c1, d1;
float32_t sumReal2, sumImag2; /* accumulator */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32x4x2_t a0V, a1V;
float32x4_t accR0,accI0, accR1,accI1,tempR, tempI;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
uint16_t col, i = 0U, j, rowCnt, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
float32_t sumReal1B, sumImag1B;
float32_t sumReal2B, sumImag2B;
float32_t *pxB;
#ifdef ARM_MATH_MATRIX_CHECK
@@ -99,8 +881,173 @@ arm_status arm_mat_cmplx_mult_f32(
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
rowCnt = row >> 1;
/* Row loop */
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
pxB = px + 2 * numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
sumImag1 = 0.0f;
sumReal1B = 0.0f;
sumImag1B = 0.0f;
sumReal2 = 0.0f;
sumImag2 = 0.0f;
sumReal2B = 0.0f;
sumImag2B = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + 2*numColsA;
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
accR1 = vdupq_n_f32(0.0);
accI1 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
a1V = vld2q_f32(pIn1B); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
pIn1B += 8;
tempR = vsetq_lane_f32(*pIn2,tempR,0);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
pIn2 += 2 * numColsB;
tempR = vsetq_lane_f32(*pIn2,tempR,1);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
pIn2 += 2 * numColsB;
tempR = vsetq_lane_f32(*pIn2,tempR,2);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
pIn2 += 2 * numColsB;
tempR = vsetq_lane_f32(*pIn2,tempR,3);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
pIn2 += 2 * numColsB;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
accR1 = vmlaq_f32(accR1,a1V.val[0],tempR);
accR1 = vmlsq_f32(accR1,a1V.val[1],tempI);
accI1 = vmlaq_f32(accI1,a1V.val[1],tempR);
accI1 = vmlaq_f32(accI1,a1V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
sumReal1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
sumImag1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
a1B = *pIn1B;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
b1B = *(pIn1B + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
sumReal1B += a1B * c1;
sumImag1B += b1B * c1;
pIn1 += 2U;
pIn1B += 2U;
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
sumReal2B -= b1B * d1;
sumImag2B += a1B * d1;
/* Decrement the loop counter */
colCnt--;
}
sumReal1 += sumReal2;
sumImag1 += sumImag2;
sumReal1B += sumReal2B;
sumImag1B += sumImag2B;
/* Store the result in the destination buffer */
*px++ = sumReal1;
*px++ = sumImag1;
*pxB++ = sumReal1B;
*pxB++ = sumImag1B;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
col--;
}
/* Update the pointer pInA to point to the starting address of the next 2 row */
i = i + 2*numColsB;
pInA = pInA + 4 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
rowCnt = row & 1;
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
@@ -114,8 +1061,8 @@ arm_status arm_mat_cmplx_mult_f32(
j = 0U;
/* column loop */
do
/* Column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0f;
@@ -127,94 +1074,58 @@ arm_status arm_mat_cmplx_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
accR0 = vdupq_n_f32(0.0);
accI0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* matrix multiplication */
/* Matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
a0V = vld2q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 8;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
sumReal1 += a0 * c0;
sumImag1 += b0 * c0;
pIn1 += 2U;
tempR = vsetq_lane_f32(*pIn2,tempR,0);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
pIn2 += 2 * numColsB;
sumReal2 -= b0 * d0;
sumImag2 += a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
pIn1 += 2U;
tempR = vsetq_lane_f32(*pIn2,tempR,1);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
a0 = *pIn1;
c0 = *pIn2;
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
sumReal1 += a0 * c0;
sumImag1 += b0 * c0;
pIn1 += 2U;
tempR = vsetq_lane_f32(*pIn2,tempR,2);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
pIn2 += 2 * numColsB;
sumReal2 -= b0 * d0;
sumImag2 += a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
sumReal1 += a1 * c1;
sumImag1 += b1 * c1;
pIn1 += 2U;
tempR = vsetq_lane_f32(*pIn2,tempR,3);
tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
pIn2 += 2 * numColsB;
sumReal2 -= b1 * d1;
sumImag2 += a1 * d1;
accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
@@ -248,13 +1159,234 @@ arm_status arm_mat_cmplx_mult_f32(
/* Decrement the column loop counter */
col--;
} while (col > 0U);
}
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
float32_t sumReal, sumImag; /* Accumulator */
float32_t a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
float32_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + 2 * i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal = 0.0f;
sumImag = 0.0f;
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* Reading real part of complex matrix A */
a0 = *pIn1;
/* Reading real part of complex matrix B */
c0 = *pIn2;
/* Reading imaginary part of complex matrix A */
b0 = *(pIn1 + 1U);
/* Reading imaginary part of complex matrix B */
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a0 * c0;
sumImag += b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b0 * d0;
sumImag += a0 * d0;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop count */
colCnt--;
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal += a1 * c1;
sumImag += b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal -= b1 * d1;
sumImag += a1 * d1;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sumReal;
*px++ = sumImag;
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
@@ -267,6 +1399,9 @@ arm_status arm_mat_cmplx_mult_f32(
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
 * @} end of MatrixMult group
 */

View File

@@ -3,13 +3,13 @@
* Title: arm_cmplx_mat_mult_q15.c
* Description: Q15 complex matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,144 +26,372 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Q15 Complex matrix multiplication
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @param[in] *pScratch points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* \par Conditions for optimum performance
* Input, output and state buffers should be aligned by 32-bit
*
* \par Restrictions
* If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
* In this case input, output, scratch buffers should be aligned by 32-bit
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator. The inputs to the
* multiplications are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate
* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
* truncated to 34.15 format by discarding the low 15 bits and then saturated to
* 1.15 format.
*
* \par
* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function.
*
@brief Q15 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@param[in] pScratch points to an array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Conditions for optimum performance
Input, output and state buffers should be aligned by 32-bit
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
/* accumulator */
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q63_t sumReal, sumImag;
#ifdef UNALIGNED_SUPPORT_DISABLE
q15_t in; /* Temporary variable to hold the input value */
q15_t a, b, c, d;
#else
q31_t in; /* Temporary variable to hold the input value */
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#endif
q15_t const *pInA = (q15_t const *) pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t const *pInB = (q15_t const *) pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t const *pInB2;
q15_t *px; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
uint32_t blkCnt; /* loop counters */
uint16x8_t vecOffs, vecColBOffs;
arm_status status; /* Status of matrix multiplication */
(void)pScratch;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
vecColBOffs[4] = 2 * numColsB * CMPLX_DIM;
vecColBOffs[5] = 2 * (numColsB * CMPLX_DIM) + 1;
vecColBOffs[6] = 3 * numColsB * CMPLX_DIM;
vecColBOffs[7] = 3 * (numColsB * CMPLX_DIM) + 1;
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA;
px = pDst->pData;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u)
{
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
j = 0;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInB2 = pInB + CMPLX_DIM;
j += 2 * CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc2, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc3, 15);
}
col = numColsB & 1;
/*
* column loop
*/
while (col > 0u)
{
q15_t const *pSrcAVec;
//, *pSrcBVec, *pSrcB2Vec;
q15x8_t vecA, vecB;
q63_t acc0, acc1;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
j += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Initiate the pointers
* - current Matrix A rows
* - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
*/
pSrcAVec = (q15_t const *) pInA;
acc0 = 0LL;
acc1 = 0LL;
vecOffs = vecColBOffs;
blkCnt = (numColsA * CMPLX_DIM) >> 3;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
/*
* move Matrix B read offsets, 4 rows down
*/
vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
vecA = vldrhq_z_s16(pSrcAVec, p0);
acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
}
/*
* Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
*/
*px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
}
i = i + numColsA * CMPLX_DIM;
/*
* Decrement the row loop counter
*/
row--;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pScratch)
{
q15_t *pSrcBT = pScratch; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
q63_t sumReal, sumImag; /* accumulator */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t prod1, prod2;
q31_t pSourceA, pSourceB;
#else
q15_t a, b, c, d;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
#if defined (ARM_MATH_LOOPUNROLL)
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pSrcBT + i;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifdef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
@@ -171,79 +399,33 @@ arm_status arm_mat_cmplx_mult_q15(
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read two elements from the row */
in = *pInB++;
*px = in;
in = *pInB++;
px[1] = in;
#else
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Initialize blkCnt with number of samples */
col = numColsB;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
*__SIMD32(px) = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (col > 0U)
{
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Read two elements from row */
write_q15x2 (px, read_q15x2_ia (&pInB));
*__SIMD32(px) = in;
#endif
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB * 2;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + 2U;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@@ -252,33 +434,61 @@ arm_status arm_mat_cmplx_mult_q15(
/* row loop */
do
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sumReal = 0;
sumImag = 0;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i * 2;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 1U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#ifdef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA buffer */
a = *pInA;
@@ -304,53 +514,40 @@ arm_status arm_mat_cmplx_mult_q15(
pInA += 4U;
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
/* update pointer */
pInB += 4U;
#else
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
if ((numColsA & 0x1U) > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#ifdef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = read_q15x2_ia (&pInA);
pSourceB = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#else /* #if defined (ARM_MATH_DSP) */
/* read real and imag values from pSrcA and pSrcB buffer */
a = *pInA++;
@@ -359,55 +556,40 @@ arm_status arm_mat_cmplx_mult_q15(
d = *pInB++;
/* Multiply and Accumlates */
sumReal += (q31_t) a *c;
sumImag += (q31_t) a *d;
sumReal -= (q31_t) b *d;
sumImag += (q31_t) b *c;
sumReal += (q31_t) a * c;
sumImag += (q31_t) a * d;
sumReal -= (q31_t) b * d;
sumImag += (q31_t) b * c;
#else
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA = *__SIMD32(pInA)++;
pSourceB = *__SIMD32(pInB)++;
/* Multiply and Accumlates */
#ifdef ARM_MATH_BIG_ENDIAN
prod1 = -__SMUSD(pSourceA, pSourceB);
#else
prod1 = __SMUSD(pSourceA, pSourceB);
#endif
prod2 = __SMUADX(pSourceA, pSourceB);
sumReal += (q63_t) prod1;
sumImag += (q63_t) prod2;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
#endif /* #if defined (ARM_MATH_DSP) */
}
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
*px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_cmplx_mult_q31.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,66 +26,512 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup CmplxMatrixMult
* @{
@addtogroup CmplxMatrixMult
@{
*/
/**
* @brief Q31 Complex matrix multiplication
* @param[in] *pSrcA points to the first input complex matrix structure
* @param[in] *pSrcB points to the second input complex matrix structure
* @param[out] *pDst points to output complex matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate
* multiplication results but provides only a single guard bit. There is no saturation
* on intermediate additions. Thus, if the accumulator overflows it wraps around and
* distorts the result. The input signals should be scaled down to avoid intermediate
* overflows. The input is thus scaled down by log2(numColsA) bits
* to avoid overflows, as a total of numColsA additions are performed internally.
* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*
*
@brief Q31 Complex matrix multiplication.
@param[in] pSrcA points to first input complex matrix structure
@param[in] pSrcB points to second input complex matrix structure
@param[out] pDst points to output complex matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/* 2x2 Q31 complex matrix multiply, Helium (MVE) specialization.
 * Computes pDst = pSrcA * pSrcB for 2x2 complex matrices (interleaved
 * re/im storage, CMPLX_DIM elements per complex value). One B column is
 * gathered per pass; each pass produces one output column. Accumulation
 * is in 64-bit (2.62) and results are shifted right by 31 to 1.31.
 * Returns ARM_MATH_SUCCESS always (no size check here; caller validated). */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs0;
q31_t const *pInA0 = pInA; /* row 0 of A */
q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM2; /* row 1 of A */
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecA;
/* Gather offsets selecting one B column: {re,im} of row 0 then row 1 */
static const uint32_t offsetB0[4] = {
0, 1,
MATRIX_DIM2 * CMPLX_DIM, MATRIX_DIM2 * CMPLX_DIM + 1
};
vecColBOffs0 = vldrwq_u32(offsetB0);
pInB = (q31_t const *) pSrcB->pData;
/* --- output column 0 --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrwq_s32(pInA0);
/* acc0/acc1: real/imag accumulators for row 0 (stored to pOut[..+0]/[..+1]) */
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
/* 2.62 accumulators narrowed to 1.31 via 64-bit arithmetic shift */
pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc3, 31);
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 1 --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
pOut += CMPLX_DIM;
pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc3, 31);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 3x3 Q31 complex matrix multiply, Helium (MVE) specialization.
 * Computes pDst = pSrcA * pSrcB for 3x3 complex matrices. Each B column
 * (3 complex values = 6 q31 words) is gathered as two vectors: vecB holds
 * rows 0-1, vecB1 holds row 2 with its upper half masked out (INACTIVELANE
 * offsets + tail predicate p0), since 6 words do not fill two full 4-lane
 * vectors. 64-bit (2.62) accumulators are shifted right by 31 to 1.31.
 * Returns ARM_MATH_SUCCESS always (no size check here; caller validated). */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs0, vecColBOffs1;
q31_t const *pInA0 = pInA; /* row 0 of A */
q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM3; /* row 1 of A */
q31_t const *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM3; /* row 2 of A */
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecB1, vecA;
/*
 * enable predication to disable upper half complex vector element
 */
mve_pred16_t p0 = vctp32q(CMPLX_DIM);
/* Gather offsets for one B column: rows 0-1 in offsetB0, row 2 in offsetB1
 * (remaining lanes inactive) */
static const uint32_t offsetB0[4] = {
0, 1,
MATRIX_DIM3 * CMPLX_DIM, MATRIX_DIM3 * CMPLX_DIM + 1
};
static const uint32_t offsetB1[4] = {
2 * MATRIX_DIM3 * CMPLX_DIM, 2 * MATRIX_DIM3 * CMPLX_DIM + 1,
INACTIVELANE, INACTIVELANE
};
vecColBOffs0 = vldrwq_u32(offsetB0);
vecColBOffs1 = vldrwq_u32(offsetB1);
pInB = (q31_t const *) pSrcB->pData;
/* --- output column 0 --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
/* rows 0 and 1 of A against B column; acc even = real, acc odd = imag */
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
/* tail: third complex element of each A row, predicated loads */
vecA = vldrwq_z_s32(&pInA0[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_z_s32(&pInA1[4], p0);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
/* row 2 of A against same B column */
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_z_s32(&pInA2[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 1 (same schedule as column 0) --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_z_s32(&pInA0[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_z_s32(&pInA1[4], p0);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_z_s32(&pInA2[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 2 (same schedule as column 0) --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_z_s32(&pInA0[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_z_s32(&pInA1[4], p0);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_z_s32(&pInA2[4], p0);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 4x4 Q31 complex matrix multiply, Helium (MVE) specialization.
 * Computes pDst = pSrcA * pSrcB for 4x4 complex matrices. Each B column
 * (4 complex values = 8 q31 words) is gathered as two full vectors: vecB
 * holds rows 0-1, vecB1 holds rows 2-3 — no predication needed, unlike the
 * 3x3 variant. For every column, all four A rows are processed; 64-bit
 * (2.62) accumulators are shifted right by 31 to 1.31 on store.
 * Returns ARM_MATH_SUCCESS always (no size check here; caller validated). */
__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs0, vecColBOffs1;
q31_t const *pInA0 = pInA; /* row 0 of A */
q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM4; /* row 1 of A */
q31_t const *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM4; /* row 2 of A */
q31_t const *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM4; /* row 3 of A */
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecB1, vecA;
/* Gather offsets for one B column: rows 0-1 in offsetB0, rows 2-3 in offsetB1 */
static const uint32_t offsetB0[4] = {
0, 1,
MATRIX_DIM4 * CMPLX_DIM, MATRIX_DIM4 * CMPLX_DIM + 1
};
static const uint32_t offsetB1[4] = {
2 * MATRIX_DIM4 * CMPLX_DIM, 2 * MATRIX_DIM4 * CMPLX_DIM + 1,
3 * MATRIX_DIM4 * CMPLX_DIM, 3 * MATRIX_DIM4 * CMPLX_DIM + 1
};
vecColBOffs0 = vldrwq_u32(offsetB0);
vecColBOffs1 = vldrwq_u32(offsetB1);
pInB = (q31_t const *) pSrcB->pData;
/* --- output column 0 --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
/* rows 0 and 1 of A; acc even = real, acc odd = imag (see stores below) */
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
/* second half of rows 0 and 1 against B rows 2-3 */
vecA = vldrwq_s32(&pInA0[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA1[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
/* rows 2 and 3 of A against same B column */
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA2[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA3[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 1 (same schedule as column 0) --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA0[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA1[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA2[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA3[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 2 (same schedule as column 0) --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA0[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA1[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA2[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA3[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
pOut += CMPLX_DIM;
/*
 * move to next B column
 */
pInB = pInB + CMPLX_DIM;
/* --- output column 3 (same schedule as column 0) --- */
vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
vecA = vldrwq_s32(pInA0);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA0[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA1[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
vecA = vldrwq_s32(pInA2);
acc0 = vmlsldavq_s32(vecA, vecB);
acc1 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc2 = vmlsldavq_s32(vecA, vecB);
acc3 = vmlaldavxq_s32(vecA, vecB);
vecA = vldrwq_s32(&pInA2[4]);
acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
vecA = vldrwq_s32(&pInA3[4]);
acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
q63_t sumReal1, sumImag1; /* accumulator */
q31_t a0, b0, c0, d0;
q31_t a1, b1, c1, d1;
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
q31_t const *pInB = (q31_t const *) pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = (q31_t const *) pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs;
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
@@ -98,6 +544,332 @@ arm_status arm_mat_cmplx_mult_q31(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* small squared matrix specialized routines
*/
if (numRowsA == numColsB && numColsB == numColsA)
{
if (numRowsA == 1)
{
q63_t sumReal = (q63_t) pInA[0] * pInB[0];
sumReal -= (q63_t) pInA[1] * pInB[1];
q63_t sumImag = (q63_t) pInA[0] * pInB[1];
sumImag += (q63_t) pInA[1] * pInB[0];
/* Store result in destination buffer */
pOut[0] = (q31_t) clip_q63_to_q31(sumReal >> 31);
pOut[1] = (q31_t) clip_q63_to_q31(sumImag >> 31);
return (ARM_MATH_SUCCESS);
}
else if (numRowsA == 2)
return arm_mat_cmplx_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_cmplx_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_cmplx_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs[0] = 0;
vecColBOffs[1] = 1;
vecColBOffs[2] = numColsB * CMPLX_DIM;
vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 1;
while (rowCnt > 0u)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i * CMPLX_DIM;
i = i + 2 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *) pSrcB->pData;
/*
* column loop
*/
while (col > 0u)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
q31_t const *pSrcA0Vec, *pSrcA1Vec;
q31_t const *pInA0 = pInA;
q31_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
q63_t acc0, acc1, acc2, acc3;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
pSrcA1Vec = (q31_t const *) pInA1;
vecOffs = vecColBOffs;
/*
* process 1 x 2 block output
*/
blkCnt = (numColsA * CMPLX_DIM) >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/*
* move Matrix B read offsets, 2 rows down
*/
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
vecA = vld1q(pSrcA1Vec);
pSrcA1Vec += 4;
acc2 = vmlsldavaq(acc2, vecA, vecB);
acc3 = vmlaldavaxq(acc3, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
/*
* move Matrix B read offsets, 2 rows down
*/
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
vecA = vld1q(pSrcA1Vec);
acc2 = vmlsldavaq(acc2, vecA, vecB);
acc3 = vmlaldavaxq(acc3, vecA, vecB);
}
px[0 * CMPLX_DIM * numColsB + 0] = (q31_t) clip_q63_to_q31(acc0 >> 31);
px[0 * CMPLX_DIM * numColsB + 1] = (q31_t) clip_q63_to_q31(acc1 >> 31);
px[1 * CMPLX_DIM * numColsB + 0] = (q31_t) clip_q63_to_q31(acc2 >> 31);
px[1 * CMPLX_DIM * numColsB + 1] = (q31_t) clip_q63_to_q31(acc3 >> 31);
px += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 2) * CMPLX_DIM;
/*
* Decrement the row loop counter
*/
rowCnt --;
}
rowCnt = row & 1;
while (rowCnt > 0u)
{
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i * CMPLX_DIM;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *) pSrcB->pData;
/*
* column loop
*/
while (col > 0u)
{
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
q31_t const *pSrcA0Vec;
q31_t const *pInA0 = pInA;
q63_t acc0,acc1;
acc0 = 0LL;
acc1 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
vecOffs = vecColBOffs;
/*
* process 1 x 2 block output
*/
blkCnt = (numColsA * CMPLX_DIM) >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/*
* move Matrix B read offsets, 2 rows down
*/
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA * CMPLX_DIM) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
/*
* move Matrix B read offsets, 2 rows down
*/
vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
vecA = vld1q(pSrcA0Vec);
acc0 = vmlsldavaq(acc0, vecA, vecB);
acc1 = vmlaldavaxq(acc1, vecA, vecB);
}
px[0] = (q31_t) clip_q63_to_q31(acc0 >> 31);
px[1] = (q31_t) clip_q63_to_q31(acc1 >> 31);
px += CMPLX_DIM;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
}
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA * CMPLX_DIM;
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_cmplx_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
q63_t sumReal, sumImag; /* Accumulator */
q31_t a1, b1, c1, d1;
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#if defined (ARM_MATH_LOOPUNROLL)
q31_t a0, b0, c0, d0;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
@@ -119,16 +891,18 @@ arm_status arm_mat_cmplx_mult_q31(
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sumReal1 = 0.0;
sumImag1 = 0.0;
sumReal = 0.0;
sumImag = 0.0;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* matrix multiplication */
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
@@ -145,76 +919,74 @@ arm_status arm_mat_cmplx_mult_q31(
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a0 *c0;
sumImag1 += (q63_t) b0 *c0;
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b0 *d0;
sumImag1 += (q63_t) a0 *d0;
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
a1 = *pIn1;
c1 = *pIn2;
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
a0 = *pIn1;
c0 = *pIn2;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
a0 = *(pIn1 );
c0 = *(pIn2 );
b0 = *(pIn1 + 1U);
d0 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a0 *c0;
sumImag1 += (q63_t) b0 *c0;
sumReal += (q63_t) a0 * c0;
sumImag += (q63_t) b0 * c0;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b0 *d0;
sumImag1 += (q63_t) a0 *d0;
sumReal -= (q63_t) b0 * d0;
sumImag += (q63_t) a0 * d0;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement the loop count */
/* Decrement loop count */
colCnt--;
}
@@ -222,49 +994,55 @@ arm_status arm_mat_cmplx_mult_q31(
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
#else
/* Initialize blkCnt with number of samples */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
a1 = *pIn1;
c1 = *pIn2;
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
a1 = *(pIn1 );
c1 = *(pIn2 );
b1 = *(pIn1 + 1U);
d1 = *(pIn2 + 1U);
/* Multiply and Accumlates */
sumReal1 += (q63_t) a1 *c1;
sumImag1 += (q63_t) b1 *c1;
sumReal += (q63_t) a1 * c1;
sumImag += (q63_t) b1 * c1;
/* update pointers */
pIn1 += 2U;
pIn2 += 2 * numColsB;
/* Multiply and Accumlates */
sumReal1 -= (q63_t) b1 *d1;
sumImag1 += (q63_t) a1 *d1;
sumReal -= (q63_t) b1 * d1;
sumImag += (q63_t) a1 * d1;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sumReal1 >> 31);
*px++ = (q31_t) clip_q63_to_q31(sumImag1 >> 31);
/* Store result in destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sumReal >> 31);
*px++ = (q31_t) clip_q63_to_q31(sumImag >> 31);
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
j++;
pIn2 = pSrcB->pData + 2U * j;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + 2 * numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
@@ -276,7 +1054,8 @@ arm_status arm_mat_cmplx_mult_q31(
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,131 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_f16.c
* Description: Floating-point complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
  @brief         Floating-point complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
{
    /* A complex f16 element is a pair of 16-bit halves, so the generic
       Helium 16-bit transpose kernel can be reused unchanged. */
    uint16_t *pSrcData = (uint16_t *) pSrc->pData;
    uint16_t *pDstData = (uint16_t *) pDst->pData;

    return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, pSrcData,
                                     pDst->numRows, pDst->numCols, pDstData);
}
#else
arm_status arm_mat_cmplx_trans_f16(
  const arm_matrix_instance_f16 * pSrc,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pRead = pSrc->pData;              /* linear walk over the source data */
  float16_t *pBase = pDst->pData;              /* start of the destination data */
  float16_t *pWrite;                           /* strided cursor into the destination */
  uint16_t srcRows = pSrc->numRows;            /* rows in the source matrix */
  uint16_t srcCols = pSrc->numCols;            /* columns in the source matrix */
  uint16_t colCnt;                             /* inner (column) loop counter */
  uint16_t rowIdx = 0U;                        /* index of the source row in flight */
  uint16_t rowCnt = srcRows;                   /* outer (row) loop counter */
  arm_status status;                           /* function result */

#ifdef ARM_MATH_MATRIX_CHECK

  /* The destination must be the flipped shape of the source */
  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Each source row is copied into the matching destination column */
    do
    {
      /* Destination cursor starts at complex element (0, rowIdx) of pDst */
      pWrite = pBase + CMPLX_DIM * rowIdx;

      for (colCnt = srcCols; colCnt > 0U; colCnt--)
      {
        /* Move one complex element (real, imag) across */
        pWrite[0] = *pRead++;
        pWrite[1] = *pRead++;

        /* Advance to the next destination row within the same column */
        pWrite += CMPLX_DIM * srcRows;
      }

      rowIdx++;
      rowCnt--;
    } while (rowCnt > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
 * @} end of MatrixComplexTrans group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,133 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_f32.c
* Description: Floating-point complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixComplexTrans Complex Matrix Transpose
Tranposes a complex matrix.
Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
\image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
  @brief         Floating-point complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_f32(const arm_matrix_instance_f32 * pSrc, arm_matrix_instance_f32 * pDst)
{
    /* A complex f32 element is a pair of 32-bit words, so the generic
       Helium 32-bit transpose kernel can be reused unchanged. */
    uint32_t *pSrcData = (uint32_t *) pSrc->pData;
    uint32_t *pDstData = (uint32_t *) pDst->pData;

    return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, pSrcData,
                                     pDst->numRows, pDst->numCols, pDstData);
}
#else
arm_status arm_mat_cmplx_trans_f32(
  const arm_matrix_instance_f32 * pSrc,
  arm_matrix_instance_f32 * pDst)
{
  float32_t *pRead = pSrc->pData;              /* linear walk over the source data */
  float32_t *pBase = pDst->pData;              /* start of the destination data */
  float32_t *pWrite;                           /* strided cursor into the destination */
  uint16_t srcRows = pSrc->numRows;            /* rows in the source matrix */
  uint16_t srcCols = pSrc->numCols;            /* columns in the source matrix */
  uint16_t colCnt;                             /* inner (column) loop counter */
  uint16_t rowIdx = 0U;                        /* index of the source row in flight */
  uint16_t rowCnt = srcRows;                   /* outer (row) loop counter */
  arm_status status;                           /* function result */

#ifdef ARM_MATH_MATRIX_CHECK

  /* The destination must be the flipped shape of the source */
  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Each source row is copied into the matching destination column */
    do
    {
      /* Destination cursor starts at complex element (0, rowIdx) of pDst */
      pWrite = pBase + CMPLX_DIM * rowIdx;

      for (colCnt = srcCols; colCnt > 0U; colCnt--)
      {
        /* Move one complex element (real, imag) across */
        pWrite[0] = *pRead++;
        pWrite[1] = *pRead++;

        /* Advance to the next destination row within the same column */
        pWrite += CMPLX_DIM * srcRows;
      }

      rowIdx++;
      rowCnt--;
    } while (rowCnt > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
 * @} end of MatrixComplexTrans group
*/

View File

@@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
 * Title:        arm_mat_cmplx_trans_q15.c
* Description: Q15 complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Q15 complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_q15(const arm_matrix_instance_q15 * pSrc, arm_matrix_instance_q15 * pDst)
{
    /* A complex Q15 element is a pair of 16-bit halves, so the generic
       Helium 16-bit transpose kernel can be reused unchanged. */
    uint16_t *pSrcData = (uint16_t *) pSrc->pData;
    uint16_t *pDstData = (uint16_t *) pDst->pData;

    return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, pSrcData,
                                     pDst->numRows, pDst->numCols, pDstData);
}
#else
arm_status arm_mat_cmplx_trans_q15(
  const arm_matrix_instance_q15 * pSrc,
  arm_matrix_instance_q15 * pDst)
{
  q15_t *pRead = pSrc->pData;                  /* linear walk over the source data */
  q15_t *pWrite;                               /* strided cursor into the destination */
  uint16_t srcRows = pSrc->numRows;            /* rows in the source matrix */
  uint16_t srcCols = pSrc->numCols;            /* columns in the source matrix */
  uint16_t colCnt;                             /* inner (column) loop counter */
  uint16_t rowIdx = 0U;                        /* index of the source row in flight */
  uint16_t rowCnt = srcRows;                   /* outer (row) loop counter */
  arm_status status;                           /* function result */

#ifdef ARM_MATH_MATRIX_CHECK

  /* The destination must be the flipped shape of the source */
  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Each source row is copied into the matching destination column */
    do
    {
      /* Destination cursor starts at complex element (0, rowIdx) of pDst */
      pWrite = pDst->pData + CMPLX_DIM * rowIdx;

      for (colCnt = srcCols; colCnt > 0U; colCnt--)
      {
        /* Move one complex element (real, imag) across */
        pWrite[0] = *pRead++;
        pWrite[1] = *pRead++;

        /* Advance to the next destination row within the same column */
        pWrite += CMPLX_DIM * srcRows;
      }

      rowIdx++;
      rowCnt--;
    } while (rowCnt > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
 * @} end of MatrixComplexTrans group
*/

View File

@@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_cmplx_trans_q31.c
* Description: Q31 complex matrix transpose
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixComplexTrans
@{
*/
/**
@brief Q31 complex matrix transpose.
@param[in] pSrc points to input matrix
@param[out] pDst points to output matrix
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_cmplx_trans_q31(const arm_matrix_instance_q31 * pSrc, arm_matrix_instance_q31 * pDst)
{
    /* A complex Q31 element is a pair of 32-bit words, so the generic
       Helium 32-bit transpose kernel can be reused unchanged. */
    uint32_t *pSrcData = (uint32_t *) pSrc->pData;
    uint32_t *pDstData = (uint32_t *) pDst->pData;

    return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, pSrcData,
                                     pDst->numRows, pDst->numCols, pDstData);
}
#else
arm_status arm_mat_cmplx_trans_q31(
  const arm_matrix_instance_q31 * pSrc,
  arm_matrix_instance_q31 * pDst)
{
  q31_t *pRead = pSrc->pData;                  /* linear walk over the source data */
  q31_t *pBase = pDst->pData;                  /* start of the destination data */
  q31_t *pWrite;                               /* strided cursor into the destination */
  uint16_t srcRows = pSrc->numRows;            /* rows in the source matrix */
  uint16_t srcCols = pSrc->numCols;            /* columns in the source matrix */
  uint16_t colCnt;                             /* inner (column) loop counter */
  uint16_t rowIdx = 0U;                        /* index of the source row in flight */
  uint16_t rowCnt = srcRows;                   /* outer (row) loop counter */
  arm_status status;                           /* function result */

#ifdef ARM_MATH_MATRIX_CHECK

  /* The destination must be the flipped shape of the source */
  if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Each source row is copied into the matching destination column */
    do
    {
      /* Destination cursor starts at complex element (0, rowIdx) of pDst */
      pWrite = pBase + CMPLX_DIM * rowIdx;

      for (colCnt = srcCols; colCnt > 0U; colCnt--)
      {
        /* Move one complex element (real, imag) across */
        pWrite[0] = *pRead++;
        pWrite[1] = *pRead++;

        /* Advance to the next destination row within the same column */
        pWrite += CMPLX_DIM * srcRows;
      }

      rowIdx++;
      rowCnt--;
    } while (rowCnt > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
 * @} end of MatrixComplexTrans group
*/

View File

@@ -0,0 +1,74 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_init_f16.c
* Description: Floating-point matrix initialization
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInit
@{
*/
/**
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f16(
  arm_matrix_instance_f16 * S,
  uint16_t nRows,
  uint16_t nColumns,
  float16_t * pData)
{
  /* Populate the instance: the dimensions first, then the backing storage.
     The caller keeps ownership of pData; no copy is made. */
  S->numRows = nRows;
  S->numCols = nColumns;
  S->pData   = pData;
}
/**
@} end of MatrixInit group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_init_f32.c
* Description: Floating-point matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,34 +26,34 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInit Matrix Initialization
*
* Initializes the underlying matrix data structure.
* The functions set the <code>numRows</code>,
* <code>numCols</code>, and <code>pData</code> fields
* of the matrix data structure.
@defgroup MatrixInit Matrix Initialization
Initializes the underlying matrix data structure.
The functions set the <code>numRows</code>,
<code>numCols</code>, and <code>pData</code> fields
of the matrix data structure.
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Floating-point matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
@brief Floating-point matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_f32(
arm_matrix_instance_f32 * S,
@@ -72,5 +72,5 @@ void arm_mat_init_f32(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_init_q15.c
* Description: Q15 matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,25 +26,25 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Q15 matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
/**
@brief Q15 matrix initialization.
@param[in,out] S points to an instance of the floating-point matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q15(
arm_matrix_instance_q15 * S,
@@ -63,5 +63,5 @@ void arm_mat_init_q15(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_init_q31.c
* Description: Q31 matrix initialization
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,30 +26,30 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInit Matrix Initialization
*
@defgroup MatrixInit Matrix Initialization
*/
/**
* @addtogroup MatrixInit
* @{
@addtogroup MatrixInit
@{
*/
/**
* @brief Q31 matrix initialization.
* @param[in,out] *S points to an instance of the floating-point matrix structure.
* @param[in] nRows number of rows in the matrix.
* @param[in] nColumns number of columns in the matrix.
* @param[in] *pData points to the matrix data array.
* @return none
*/
/**
@brief Q31 matrix initialization.
@param[in,out] S points to an instance of the Q31 matrix structure
@param[in] nRows number of rows in the matrix
@param[in] nColumns number of columns in the matrix
@param[in] pData points to the matrix data array
@return none
*/
void arm_mat_init_q31(
arm_matrix_instance_q31 * S,
@@ -68,5 +68,5 @@ void arm_mat_init_q31(
}
/**
* @} end of MatrixInit group
@} end of MatrixInit group
*/

View File

@@ -0,0 +1,891 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_inverse_f16.c
* Description: Floating-point matrix inverse
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
@brief Floating-point matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * MVE (Helium) vectorized implementation of the half-precision matrix inverse,
 * using the Gauss-Jordan method on the augmented matrix [pSrc | I].
 * NOTE: the source matrix (pSrc->pData) is modified in place by the reduction.
 */
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float16_t *pTmpA, *pTmpB; /* scratch pointers for the vectorized row-swap / row-scale loops */
_Float16 in = 0.0f16; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
uint32_t blkCnt; /* vector-loop counter (8 half-floats per MVE vector) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and of equal size */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
 * Matrix Inverse can be solved using elementary row operations.
 *
 * Gauss-Jordan Method:
 *
 * 1. First combine the identity matrix and the input matrix separated by a bar to form an
 * augmented matrix as follows:
 * _ _ _ _ _ _ _ _
 * | | a11 a12 | | | 1 0 | | | X11 X12 |
 * | | | | | | | = | |
 * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X22 _|
 *
 * 2. In our implementation, pDst Matrix is used as identity matrix.
 *
 * 3. Begin with the first row. Let i = 1.
 *
 * 4. Check to see if the pivot for row i is zero.
 * The pivot is the element of the main diagonal that is on the current row.
 * For instance, if working with row i, then the pivot element is aii.
 * If the pivot is zero, exchange that row with a row below it that does not
 * contain a zero in column i. If this is not possible, then an inverse
 * to that matrix does not exist.
 *
 * 5. Divide every element of row i by the pivot.
 *
 * 6. For every row below row i, replace that row with the sum of that row and
 * a multiple of row i so that each new element in column i below row i is zero.
 *
 * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
 * for every element below and above the main diagonal.
 *
 * 8. Now an identity matrix is formed to the left of the bar(input matrix, src).
 * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
 *----------------------------------------------------------------------------------------------------------------*/
/*
 * Working pointer for destination matrix
 */
pOutT1 = pOut;
/*
 * Loop over the number of rows
 */
rowCnt = numRows;
/*
 * Making the destination matrix an identity matrix
 */
while (rowCnt > 0U)
{
/*
 * Writing all zeroes in lower triangle of the destination matrix
 */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/*
 * Writing all ones in the diagonal of the destination matrix
 */
*pOutT1++ = 1.0f16;
/*
 * Writing all zeroes in upper triangle of the destination matrix
 */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/*
 * Decrement the loop counter
 */
rowCnt--;
}
/*
 * Loop over the number of columns of the input matrix.
 * All the elements in each column are processed by the row operations
 */
loopCnt = numCols;
/*
 * Index modifier to navigate through the columns
 */
l = 0U;
while (loopCnt > 0U)
{
/*
 * Check if the pivot element is zero..
 * If it is zero then interchange the row with a non zero row below.
 * If there is no non zero element to replace in the rows below,
 * then the matrix is Singular.
 */
/*
 * Working pointer for the input matrix that points
 * to the pivot element of the particular row
 */
pInT1 = pIn + (l * numCols);
/*
 * Working pointer for the destination matrix that points
 * to the pivot element of the particular row
 */
pOutT1 = pOut + (l * numCols);
/*
 * Temporary variable to hold the pivot value
 */
in = *pInT1;
/*
 * Check if the pivot element is zero
 */
if ((_Float16)*pInT1 == 0.0f16)
{
/*
 * Loop over the number rows present below
 */
for (i = 1U; i < numRows-l; i++)
{
/*
 * Update the input and destination pointers
 */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/*
 * Check if there is a non zero pivot element to
 * replace in the rows below
 */
if ((_Float16)*pInT2 != 0.0f16)
{
f16x8_t vecA, vecB;
/*
 * Vectorized row exchange: swap the columns to the
 * right of the pivot element, 8 half-floats at a time
 */
pTmpA = pInT1;
pTmpB = pInT2;
blkCnt = (numCols - l) >> 3;
while (blkCnt > 0U)
{
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_f16(pTmpB, vecA);
vstrhq_f16(pTmpA, vecB);
pTmpA += 8;
pTmpB += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail: remaining 0..7 elements handled with a predicated store
 * (will be merged thru tail predication)
 */
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_p_f16(pTmpB, vecA, p0);
vstrhq_p_f16(pTmpA, vecB, p0);
}
pInT1 += numCols - l;
pInT2 += numCols - l;
/* Exchange the full corresponding rows of the destination matrix */
pTmpA = pOutT1;
pTmpB = pOutT2;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_f16(pTmpB, vecA);
vstrhq_f16(pTmpA, vecB);
pTmpA += 8;
pTmpB += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 */
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecB = vldrhq_f16(pTmpB);
vstrhq_p_f16(pTmpB, vecA, p0);
vstrhq_p_f16(pTmpA, vecB, p0);
}
pOutT1 += numCols;
pOutT2 += numCols;
/*
 * Flag to indicate whether exchange is done or not
 */
flag = 1U;
/*
 * Break after exchange is done
 */
break;
}
}
}
/*
 * Update the status if the matrix is singular:
 * pivot is zero and no row below could be swapped in
 */
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
/*
 * Points to the pivot row of input and destination matrices
 */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/*
 * Temporary pointers to the pivot row pointers
 */
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/*
 * Pivot element of the row (guaranteed non-zero at this point)
 */
in = *(pIn + (l * numCols));
pTmpA = pInT1;
/* Scale the pivot row by the reciprocal of the pivot (vectorized) */
f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
blkCnt = (numCols - l) >> 3;
f16x8_t vecA;
while (blkCnt > 0U)
{
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
pTmpA += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 */
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecA = vecA * invIn;
vstrhq_p_f16(pTmpA, vecA, p0);
}
pInT1 += numCols - l;
/*
 * Scale the full pivot row of the destination matrix
 * by the same reciprocal
 */
pTmpA = pOutT1;
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
*(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
pTmpA += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vldrhq_f16(pTmpA);
vecA = vecA * invIn;
vstrhq_p_f16(pTmpA, vecA, p0);
}
pOutT1 += numCols;
/*
 * Replace the rows with the sum of that row and a multiple of row i
 * so that each new element in column i above and below row i is zero.
 */
/*
 * Temporary pointers for input and destination matrices
 */
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/*
 * Check for the pivot element
 */
if (i == l)
{
/*
 * If the processing element is the pivot element,
 * only the columns to the right are to be processed
 */
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/*
 * Element of the reference row
 */
/*
 * Working pointers for input and destination pivot rows
 */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/*
 * Loop over the number of columns to the right of the pivot element,
 * to replace the elements in the input matrix:
 * row_i <- row_i - in * pivot_row (vfmsq computes vec1 - tmpV * vec2)
 */
in = *pInT1;
f16x8_t tmpV = vdupq_n_f16(in);
blkCnt = (numCols - l) >> 3;
while (blkCnt > 0U)
{
f16x8_t vec1, vec2;
/*
 * Replace the element by the sum of that row
 * and a multiple of the reference row
 */
vec1 = vldrhq_f16(pInT1);
vec2 = vldrhq_f16(pPRT_in);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_f16(pInT1, vec1);
pPRT_in += 8;
pInT1 += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = (numCols - l) & 7;
if (blkCnt > 0U)
{
f16x8_t vec1, vec2;
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vldrhq_f16(pInT1);
vec2 = vldrhq_f16(pPRT_in);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_p_f16(pInT1, vec1, p0);
pInT1 += blkCnt;
}
/* Same elimination step on the full row of the destination matrix */
blkCnt = numCols >> 3;
while (blkCnt > 0U)
{
f16x8_t vec1, vec2;
/*
 * Replace the element by the sum of that row
 * and a multiple of the reference row
 */
vec1 = vldrhq_f16(pOutT1);
vec2 = vldrhq_f16(pPRT_pDst);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_f16(pOutT1, vec1);
pPRT_pDst += 8;
pOutT1 += 8;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = numCols & 7;
if (blkCnt > 0U)
{
f16x8_t vec1, vec2;
mve_pred16_t p0 = vctp16q(blkCnt);
vec1 = vldrhq_f16(pOutT1);
vec2 = vldrhq_f16(pPRT_pDst);
vec1 = vfmsq_f16(vec1, tmpV, vec2);
vstrhq_p_f16(pOutT1, vec1, p0);
/* NOTE(review): pInT2 is not read after this point in the loop;
 * this increment appears vestigial — confirm against upstream */
pInT2 += blkCnt;
pOutT1 += blkCnt;
}
}
/*
 * Increment the temporary input pointer
 * (skip the l columns to the left of the pivot, already zeroed)
 */
pInT1 = pInT1 + l;
}
/*
 * Increment the input pointer
 */
pIn++;
/*
 * Decrement the loop counter
 */
loopCnt--;
/*
 * Increment the index modifier
 */
l++;
}
/*
 * Set status as ARM_MATH_SUCCESS
 */
status = ARM_MATH_SUCCESS;
/* Final guard: if no row exchange ever happened and the last pivot read
 * was zero, re-scan the original source; an all-zero input is reported
 * as singular. NOTE(review): mirrors the scalar path — verify semantics. */
if ((flag != 1U) && (in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if ((_Float16)pIn[i] != 0.0f16)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#else
/*
 * Scalar (non-MVE) implementation of the half-precision matrix inverse,
 * using the Gauss-Jordan method on the augmented matrix [pSrc | I].
 * NOTE: the source matrix (pSrc->pData) is modified in place by the reduction.
 */
arm_status arm_mat_inverse_f16(
const arm_matrix_instance_f16 * pSrc,
arm_matrix_instance_f16 * pDst)
{
float16_t *pIn = pSrc->pData; /* input data matrix pointer */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
_Float16 Xchg, in = 0.0f16, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: both matrices must be square and of equal size */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
 * Matrix Inverse can be solved using elementary row operations.
 *
 * Gauss-Jordan Method:
 *
 * 1. First combine the identity matrix and the input matrix separated by a bar to form an
 * augmented matrix as follows:
 * _ _ _ _
 * | a11 a12 | 1 0 | | X11 X12 |
 * | | | = | |
 * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
 *
 * 2. In our implementation, pDst Matrix is used as identity matrix.
 *
 * 3. Begin with the first row. Let i = 1.
 *
 * 4. Check to see if the pivot for row i is zero.
 * The pivot is the element of the main diagonal that is on the current row.
 * For instance, if working with row i, then the pivot element is aii.
 * If the pivot is zero, exchange that row with a row below it that does not
 * contain a zero in column i. If this is not possible, then an inverse
 * to that matrix does not exist.
 *
 * 5. Divide every element of row i by the pivot.
 *
 * 6. For every row below row i, replace that row with the sum of that row and
 * a multiple of row i so that each new element in column i below row i is zero.
 *
 * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
 * for every element below and above the main diagonal.
 *
 * 8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
 * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
 *----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix an identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f16;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f16;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
 All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
 * If it is zero then interchange the row with a non zero row below.
 * If there is no non zero element to replace in the rows below,
 * then the matrix is Singular. */
/* Working pointer for the input matrix that points
 * to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
 * to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Check if the pivot element is zero */
if ((_Float16)*pInT1 == 0.0f16)
{
/* Loop over the number rows present below */
for (i = 1U; i < numRows-l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
 * replace in the rows below */
if ((_Float16)*pInT2 != 0.0f16)
{
/* Exchange the columns to the right of the pivot element
 * between the current row and the found row */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Exchange the full corresponding rows of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
}
}
/* Update the status if the matrix is singular:
 * pivot is zero and no row below could be swapped in */
if ((flag != 1U) && (in == 0.0f16))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row (guaranteed non-zero at this point) */
in = *pPivotRowIn;
/* Loop over number of columns
 * to the right of the pivot element */
j = (numCols - l);
while (j > 0U)
{
/* Divide each element of the row of the input matrix
 * by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
 * by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
 * so that each new element in column i above and below row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
 only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row (multiplier for the elimination step) */
in = *pInT1;
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
 to replace the elements in the input matrix: row <- row - in * pivot_row */
j = (numCols - l);
while (j > 0U)
{
/* Replace the element by the sum of that row
 and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
 replace the elements in the destination matrix */
j = numCols;
while (j > 0U)
{
/* Replace the element by the sum of that row
 and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = (_Float16)in1 - ((_Float16)in * (_Float16)*pPRT_pDst++);
/* Decrement loop counter */
j--;
}
}
/* Increment temporary input pointer
 (skip the l columns to the left of the pivot, already zeroed) */
pInT1 = pInT1 + l;
/* Decrement loop counter */
k--;
/* Increment pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
/* Final guard: if no row exchange ever happened and the last pivot read
 * was zero, re-scan the original source; an all-zero input is reported
 * as singular. */
if ((flag != 1U) && ((_Float16)in == 0.0f16))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if ((_Float16)pIn[i] != 0.0f16)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f32.c
* Description: Floating-point matrix inverse
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,50 +26,548 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixInv Matrix Inverse
*
* Computes the inverse of a matrix.
*
* The inverse is defined only if the input matrix is square and non-singular (the determinant
* is non-zero). The function checks that the input and output matrices are square and of the
* same size.
*
* Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
* inversion of floating-point matrices.
*
* \par Algorithm
* The Gauss-Jordan method is used to find the inverse.
* The algorithm performs a sequence of elementary row-operations until it
* reduces the input matrix to an identity matrix. Applying the same sequence
* of elementary row-operations to an identity matrix yields the inverse matrix.
* If the input matrix is singular, then the algorithm terminates and returns error status
* <code>ARM_MATH_SINGULAR</code>.
* \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
@defgroup MatrixInv Matrix Inverse
Computes the inverse of a matrix.
The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
The function checks that the input and output matrices are square and of the same size.
Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
inversion of floating-point matrices.
@par Algorithm
The Gauss-Jordan method is used to find the inverse.
The algorithm performs a sequence of elementary row-operations until it
reduces the input matrix to an identity matrix. Applying the same sequence
of elementary row-operations to an identity matrix yields the inverse matrix.
If the input matrix is singular, then the algorithm terminates and returns error status
<code>ARM_MATH_SINGULAR</code>.
\image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
*/
/**
* @addtogroup MatrixInv
* @{
@addtogroup MatrixInv
@{
*/
/**
* @brief Floating-point matrix inverse.
* @param[in] *pSrc points to input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns
* <code>ARM_MATH_SIZE_MISMATCH</code> if the input matrix is not square or if the size
* of the output matrix does not match the size of the input matrix.
* If the input matrix is found to be singular (non-invertible), then the function returns
* <code>ARM_MATH_SINGULAR</code>. Otherwise, the function returns <code>ARM_MATH_SUCCESS</code>.
@brief Floating-point matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
float32_t *pTmpA, *pTmpB;
float32_t in = 0.0f; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
uint32_t blkCnt;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/*
* Working pointer for destination matrix
*/
pOutT1 = pOut;
/*
* Loop over the number of rows
*/
rowCnt = numRows;
/*
* Making the destination matrix as identity matrix
*/
while (rowCnt > 0U)
{
/*
* Writing all zeroes in lower triangle of the destination matrix
*/
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/*
* Writing all ones in the diagonal of the destination matrix
*/
*pOutT1++ = 1.0f;
/*
* Writing all zeroes in upper triangle of the destination matrix
*/
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/*
* Decrement the loop counter
*/
rowCnt--;
}
/*
* Loop over the number of columns of the input matrix.
* All the elements in each column are processed by the row operations
*/
loopCnt = numCols;
/*
* Index modifier to navigate through the columns
*/
l = 0U;
while (loopCnt > 0U)
{
/*
* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular.
*/
/*
* Working pointer for the input matrix that points
* * to the pivot element of the particular row
*/
pInT1 = pIn + (l * numCols);
/*
* Working pointer for the destination matrix that points
* * to the pivot element of the particular row
*/
pOutT1 = pOut + (l * numCols);
/*
* Temporary variable to hold the pivot value
*/
in = *pInT1;
/*
* Check if the pivot element is zero
*/
if (*pInT1 == 0.0f)
{
/*
* Loop over the number rows present below
*/
for (i = 1U; i < numRows-l; i++)
{
/*
* Update the input and destination pointers
*/
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/*
* Check if there is a non zero pivot element to
* * replace in the rows below
*/
if (*pInT2 != 0.0f)
{
f32x4_t vecA, vecB;
/*
* Loop over number of columns
* * to the right of the pilot element
*/
pTmpA = pInT1;
pTmpB = pInT2;
blkCnt = (numCols - l) >> 2;
while (blkCnt > 0U)
{
vecA = vldrwq_f32(pTmpA);
vecB = vldrwq_f32(pTmpB);
vstrwq_f32(pTmpB, vecA);
vstrwq_f32(pTmpA, vecB);
pTmpA += 4;
pTmpB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numCols - l) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vldrwq_f32(pTmpA);
vecB = vldrwq_f32(pTmpB);
vstrwq_p_f32(pTmpB, vecA, p0);
vstrwq_p_f32(pTmpA, vecB, p0);
}
pInT1 += numCols - l;
pInT2 += numCols - l;
pTmpA = pOutT1;
pTmpB = pOutT2;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
vecA = vldrwq_f32(pTmpA);
vecB = vldrwq_f32(pTmpB);
vstrwq_f32(pTmpB, vecA);
vstrwq_f32(pTmpA, vecB);
pTmpA += 4;
pTmpB += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vldrwq_f32(pTmpA);
vecB = vldrwq_f32(pTmpB);
vstrwq_p_f32(pTmpB, vecA, p0);
vstrwq_p_f32(pTmpA, vecB, p0);
}
pOutT1 += numCols;
pOutT2 += numCols;
/*
* Flag to indicate whether exchange is done or not
*/
flag = 1U;
/*
* Break after exchange is done
*/
break;
}
}
}
/*
* Update the status if the matrix is singular
*/
if ((flag != 1U) && (in == 0.0f))
{
return ARM_MATH_SINGULAR;
}
/*
* Points to the pivot row of input and destination matrices
*/
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/*
* Temporary pointers to the pivot row pointers
*/
pInT1 = pPivotRowIn;
pOutT1 = pPivotRowDst;
/*
* Pivot element of the row
*/
in = *(pIn + (l * numCols));
pTmpA = pInT1;
f32x4_t invIn = vdupq_n_f32(1.0f / in);
blkCnt = (numCols - l) >> 2;
f32x4_t vecA;
while (blkCnt > 0U)
{
*(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA * invIn;
pTmpA += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = (numCols - l) & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vldrwq_f32(pTmpA);
vecA = vecA * invIn;
vstrwq_p_f32(pTmpA, vecA, p0);
}
pInT1 += numCols - l;
/*
* Loop over number of columns
* * to the right of the pilot element
*/
pTmpA = pOutT1;
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
*(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA *invIn;
pTmpA += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vldrwq_f32(pTmpA);
vecA = vecA * invIn;
vstrwq_p_f32(pTmpA, vecA, p0);
}
pOutT1 += numCols;
/*
* Replace the rows with the sum of that row and a multiple of row i
* * so that each new element in column i above row i is zero.
*/
/*
* Temporary pointers for input and destination matrices
*/
pInT1 = pIn;
pOutT1 = pOut;
for (i = 0U; i < numRows; i++)
{
/*
* Check for the pivot element
*/
if (i == l)
{
/*
* If the processing element is the pivot element,
* only the columns to the right are to be processed
*/
pInT1 += numCols - l;
pOutT1 += numCols;
}
else
{
/*
* Element of the reference row
*/
/*
* Working pointers for input and destination pivot rows
*/
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/*
* Loop over the number of columns to the right of the pivot element,
* to replace the elements in the input matrix
*/
in = *pInT1;
f32x4_t tmpV = vdupq_n_f32(in);
blkCnt = (numCols - l) >> 2;
while (blkCnt > 0U)
{
f32x4_t vec1, vec2;
/*
* Replace the element by the sum of that row
* and a multiple of the reference row
*/
vec1 = vldrwq_f32(pInT1);
vec2 = vldrwq_f32(pPRT_in);
vec1 = vfmsq_f32(vec1, tmpV, vec2);
vstrwq_f32(pInT1, vec1);
pPRT_in += 4;
pInT1 += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = (numCols - l) & 3;
if (blkCnt > 0U)
{
f32x4_t vec1, vec2;
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vldrwq_f32(pInT1);
vec2 = vldrwq_f32(pPRT_in);
vec1 = vfmsq_f32(vec1, tmpV, vec2);
vstrwq_p_f32(pInT1, vec1, p0);
pInT1 += blkCnt;
}
blkCnt = numCols >> 2;
while (blkCnt > 0U)
{
f32x4_t vec1, vec2;
/*
* Replace the element by the sum of that row
* and a multiple of the reference row
*/
vec1 = vldrwq_f32(pOutT1);
vec2 = vldrwq_f32(pPRT_pDst);
vec1 = vfmsq_f32(vec1, tmpV, vec2);
vstrwq_f32(pOutT1, vec1);
pPRT_pDst += 4;
pOutT1 += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numCols & 3;
if (blkCnt > 0U)
{
f32x4_t vec1, vec2;
mve_pred16_t p0 = vctp32q(blkCnt);
vec1 = vldrwq_f32(pOutT1);
vec2 = vldrwq_f32(pPRT_pDst);
vec1 = vfmsq_f32(vec1, tmpV, vec2);
vstrwq_p_f32(pOutT1, vec1, p0);
pInT2 += blkCnt;
pOutT1 += blkCnt;
}
}
/*
* Increment the temporary input pointer
*/
pInT1 = pInT1 + l;
}
/*
* Increment the input pointer
*/
pIn++;
/*
* Decrement the loop counter
*/
loopCnt--;
/*
* Increment the index modifier
*/
l++;
}
/*
* Set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
@@ -82,18 +580,16 @@ arm_status arm_mat_inverse_f32(
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float32_t maxC; /* maximum value in the column */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
arm_status status; /* status of matrix inverse */
float32x4_t vec1;
float32x4_t vec2;
float32x4_t tmpV;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
@@ -105,42 +601,40 @@ arm_status arm_mat_inverse_f32(
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -164,6 +658,7 @@ arm_status arm_mat_inverse_f32(
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
@@ -199,41 +694,19 @@ arm_status arm_mat_inverse_f32(
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Grab the most significant value from column l */
maxC = 0;
for (i = l; i < numRows; i++)
{
maxC = *pInT1 > 0 ? (*pInT1 > maxC ? *pInT1 : maxC) : (-*pInT1 > maxC ? -*pInT1 : maxC);
pInT1 += numCols;
}
/* Update the status if the matrix is singular */
if (maxC == 0.0f)
{
return ARM_MATH_SINGULAR;
}
/* Restore pInT1 */
pInT1 = pIn;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is the most significant of the column */
if ( (in > 0.0f ? in : -in) != maxC)
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
while (i > 0U)
for (i = 1U; i < numRows - l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Look for the most significant element to
/* Check if there is a non zero pivot element to
* replace in the rows below */
if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
if (*pInT2 != 0.0f)
{
/* Loop over number of columns
* to the right of the pilot element */
@@ -271,11 +744,415 @@ arm_status arm_mat_inverse_f32(
break;
}
/* Update the destination pointer modifier */
k++;
}
}
/* Decrement the loop counter */
i--;
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
{
return ARM_MATH_SINGULAR;
}
/* Points to the pivot row of input and destination matrices */
pPivotRowIn = pIn + (l * numCols);
pPivotRowDst = pOut + (l * numCols);
/* Temporary pointers to the pivot row pointers */
pInT1 = pPivotRowIn;
pInT2 = pPivotRowDst;
/* Pivot element of the row */
in = *pPivotRowIn;
tmpV = vdupq_n_f32(1.0f/in);
/* Loop over number of columns
* to the right of the pilot element */
j = (numCols - l) >> 2;
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
vec1 = vld1q_f32(pInT1);
vec1 = vmulq_f32(vec1, tmpV);
vst1q_f32(pInT1, vec1);
pInT1 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = (numCols - l) & 3;
while (j > 0U)
{
/* Divide each element of the row of the input matrix
* by the pivot element */
in1 = *pInT1;
*pInT1++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols >> 2;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
vec1 = vld1q_f32(pInT2);
vec1 = vmulq_f32(vec1, tmpV);
vst1q_f32(pInT2, vec1);
pInT2 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = numCols & 3;
while (j > 0U)
{
/* Divide each element of the row of the destination matrix
* by the pivot element */
in1 = *pInT2;
*pInT2++ = in1 / in;
/* Decrement the loop counter */
j--;
}
/* Replace the rows with the sum of that row and a multiple of row i
* so that each new element in column i above row i is zero.*/
/* Temporary pointers for input and destination matrices */
pInT1 = pIn;
pInT2 = pOut;
/* index used to check for pivot element */
i = 0U;
/* Loop over number of rows */
/* to be replaced by the sum of that row and a multiple of row i */
k = numRows;
while (k > 0U)
{
/* Check for the pivot element */
if (i == l)
{
/* If the processing element is the pivot element,
only the columns to the right are to be processed */
pInT1 += numCols - l;
pInT2 += numCols;
}
else
{
/* Element of the reference row */
in = *pInT1;
tmpV = vdupq_n_f32(in);
/* Working pointers for input and destination pivot rows */
pPRT_in = pPivotRowIn;
pPRT_pDst = pPivotRowDst;
/* Loop over the number of columns to the right of the pivot element,
to replace the elements in the input matrix */
j = (numCols - l) >> 2;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
vec1 = vld1q_f32(pInT1);
vec2 = vld1q_f32(pPRT_in);
vec1 = vmlsq_f32(vec1, tmpV, vec2);
vst1q_f32(pInT1, vec1);
pPRT_in += 4;
pInT1 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = (numCols - l) & 3;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT1;
*pInT1++ = in1 - (in * *pPRT_in++);
/* Decrement the loop counter */
j--;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
j = numCols >> 2;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
vec1 = vld1q_f32(pInT2);
vec2 = vld1q_f32(pPRT_pDst);
vec1 = vmlsq_f32(vec1, tmpV, vec2);
vst1q_f32(pInT2, vec1);
pPRT_pDst += 4;
pInT2 += 4;
/* Decrement the loop counter */
j--;
}
/* Tail */
j = numCols & 3;
while (j > 0U)
{
/* Replace the element by the sum of that row
and a multiple of the reference row */
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
j--;
}
}
/* Increment the temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
k--;
/* Increment the pivot index */
i++;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
break;
}
if (i == numRows * numCols)
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_inverse_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn = pSrc->pData; /* input data matrix pointer */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
/* Loop over the number of rows */
rowCnt = numRows;
/* Making the destination matrix as identity matrix */
while (rowCnt > 0U)
{
/* Writing all zeroes in lower triangle of the destination matrix */
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
j--;
}
/* Decrement loop counter */
rowCnt--;
}
/* Loop over the number of columns of the input matrix.
All the elements in each column are processed by the row operations */
loopCnt = numCols;
/* Index modifier to navigate through the columns */
l = 0U;
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
* If it is zero then interchange the row with non zero row below.
* If there is no non zero element to replace in the rows below,
* then the matrix is Singular. */
/* Working pointer for the input matrix that points
* to the pivot element of the particular row */
pInT1 = pIn + (l * numCols);
/* Working pointer for the destination matrix that points
* to the pivot element of the particular row */
pOutT1 = pOut + (l * numCols);
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
{
/* Loop over the number rows present below */
for (i = 1U; i < numRows - l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0f)
{
/* Loop over number of columns
* to the right of the pilot element */
j = numCols - l;
while (j > 0U)
{
/* Exchange the row elements of the input matrix */
Xchg = *pInT2;
*pInT2++ = *pInT1;
*pInT1++ = Xchg;
/* Decrement the loop counter */
j--;
}
/* Loop over number of columns of the destination matrix */
j = numCols;
while (j > 0U)
{
/* Exchange the row elements of the destination matrix */
Xchg = *pOutT2;
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement loop counter */
j--;
}
/* Flag to indicate whether exchange is done or not */
flag = 1U;
/* Break after exchange is done */
break;
}
/* Decrement loop counter */
}
}
@@ -385,19 +1262,19 @@ arm_status arm_mat_inverse_f32(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
/* Decrement loop counter */
k--;
/* Increment the pivot index */
/* Increment pivot index */
i++;
}
@@ -414,59 +1291,60 @@ arm_status arm_mat_inverse_f32(
#else
/* Run the below code for Cortex-M0 */
float32_t Xchg, in = 0.0f; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -496,7 +1374,7 @@ arm_status arm_mat_inverse_f32(
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@@ -506,7 +1384,7 @@ arm_status arm_mat_inverse_f32(
/* Index modifier to navigate through the columns */
l = 0U;
//for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@@ -525,18 +1403,15 @@ arm_status arm_mat_inverse_f32(
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
for (i = 1U; i < numRows-l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
@@ -565,12 +1440,10 @@ arm_status arm_mat_inverse_f32(
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
{
@@ -640,6 +1513,7 @@ arm_status arm_mat_inverse_f32(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@@ -651,19 +1525,21 @@ arm_status arm_mat_inverse_f32(
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
@@ -682,10 +1558,13 @@ arm_status arm_mat_inverse_f32(
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixInv group
@} end of MatrixInv group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_inverse_f64.c
* Description: Floating-point matrix inverse
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,53 +26,31 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @defgroup MatrixInv Matrix Inverse
*
* Computes the inverse of a matrix.
*
* The inverse is defined only if the input matrix is square and non-singular (the determinant
* is non-zero). The function checks that the input and output matrices are square and of the
* same size.
*
* Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
* inversion of floating-point matrices.
*
* \par Algorithm
* The Gauss-Jordan method is used to find the inverse.
* The algorithm performs a sequence of elementary row-operations until it
* reduces the input matrix to an identity matrix. Applying the same sequence
* of elementary row-operations to an identity matrix yields the inverse matrix.
* If the input matrix is singular, then the algorithm terminates and returns error status
* <code>ARM_MATH_SINGULAR</code>.
* \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
*/
/**
* @addtogroup MatrixInv
* @{
*/
/**
* @brief Floating-point matrix inverse.
* @param[in] *pSrc points to input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns
* <code>ARM_MATH_SIZE_MISMATCH</code> if the input matrix is not square or if the size
* of the output matrix does not match the size of the input matrix.
* If the input matrix is found to be singular (non-invertible), then the function returns
* <code>ARM_MATH_SINGULAR</code>. Otherwise, the function returns <code>ARM_MATH_SUCCESS</code>.
@brief Floating-point (64 bit) matrix inverse.
@param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
*/
arm_status arm_mat_inverse_f64(
const arm_matrix_instance_f64 * pSrc,
arm_matrix_instance_f64 * pDst)
arm_matrix_instance_f64 * pDst)
{
float64_t *pIn = pSrc->pData; /* input data matrix pointer */
float64_t *pOut = pDst->pData; /* output data matrix pointer */
@@ -83,64 +61,61 @@ arm_status arm_mat_inverse_f64(
uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
#if defined (ARM_MATH_DSP)
float64_t maxC; /* maximum value in the column */
/* Run the below code for Cortex-M4 and Cortex-M3 */
float64_t Xchg, in = 0.0f, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
float64_t Xchg, in = 0.0, in1; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for column i is the greatest of the column.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is not the most significant of the columns, exchange that row with a row
* below it that does contain the most significant value in column i. If the most
* significant value of the column is zero, then an inverse to that matrix does not exist.
* The most significant value of the column is the absolute maximum.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _
* | a11 a12 | 1 0 | | X11 X12 |
* | | | = | |
* |_ a21 a22 | 0 1 _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
* Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -155,22 +130,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@@ -199,41 +174,22 @@ arm_status arm_mat_inverse_f64(
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Grab the most significant value from column l */
maxC = 0;
for (i = l; i < numRows; i++)
{
maxC = *pInT1 > 0 ? (*pInT1 > maxC ? *pInT1 : maxC) : (-*pInT1 > maxC ? -*pInT1 : maxC);
pInT1 += numCols;
}
/* Update the status if the matrix is singular */
if (maxC == 0.0f)
{
return ARM_MATH_SINGULAR;
}
/* Restore pInT1 */
pInT1 = pIn;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is the most significant of the column */
if ( (in > 0.0f ? in : -in) != maxC)
/* Check if the pivot element is zero */
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
i = numRows - (l + 1U);
while (i > 0U)
for (i = 1U; i < numRows - l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Look for the most significant element to
/* Check if there is a non zero pivot element to
* replace in the rows below */
if ((*pInT2 > 0.0f ? *pInT2: -*pInT2) == maxC)
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
@@ -260,7 +216,7 @@ arm_status arm_mat_inverse_f64(
*pOutT2++ = *pOutT1;
*pOutT1++ = Xchg;
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
@@ -271,16 +227,13 @@ arm_status arm_mat_inverse_f64(
break;
}
/* Update the destination pointer modifier */
k++;
/* Decrement the loop counter */
i--;
/* Decrement loop counter */
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@@ -385,19 +338,19 @@ arm_status arm_mat_inverse_f64(
in1 = *pInT2;
*pInT2++ = in1 - (in * *pPRT_pDst++);
/* Decrement the loop counter */
/* Decrement loop counter */
j--;
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
/* Decrement the loop counter */
/* Decrement loop counter */
k--;
/* Increment the pivot index */
/* Increment pivot index */
i++;
}
@@ -414,59 +367,60 @@ arm_status arm_mat_inverse_f64(
#else
/* Run the below code for Cortex-M0 */
float64_t Xchg, in = 0.0f; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
float64_t Xchg, in = 0.0; /* Temporary input values */
uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
|| (pSrc->numRows != pDst->numRows))
if ((pSrc->numRows != pSrc->numCols) ||
(pDst->numRows != pDst->numCols) ||
(pSrc->numRows != pDst->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*--------------------------------------------------------------------------------------------------------------
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
* Matrix Inverse can be solved using elementary row operations.
*
* Gauss-Jordan Method:
*
* 1. First combine the identity matrix and the input matrix separated by a bar to form an
* augmented matrix as follows:
* _ _ _ _ _ _ _ _
* | | a11 a12 | | | 1 0 | | | X11 X12 |
* | | | | | | | = | |
* |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
*
* 2. In our implementation, pDst Matrix is used as identity matrix.
*
* 3. Begin with the first row. Let i = 1.
*
* 4. Check to see if the pivot for row i is zero.
* The pivot is the element of the main diagonal that is on the current row.
* For instance, if working with row i, then the pivot element is aii.
* If the pivot is zero, exchange that row with a row below it that does not
* contain a zero in column i. If this is not possible, then an inverse
* to that matrix does not exist.
*
* 5. Divide every element of row i by the pivot.
*
* 6. For every row below and row i, replace that row with the sum of that row and
* a multiple of row i so that each new element in column i below row i is zero.
*
* 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
* for every element below and above the main diagonal.
*
* 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
* Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
*----------------------------------------------------------------------------------------------------------------*/
/* Working pointer for destination matrix */
pOutT1 = pOut;
@@ -481,22 +435,22 @@ arm_status arm_mat_inverse_f64(
j = numRows - rowCnt;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Writing all ones in the diagonal of the destination matrix */
*pOutT1++ = 1.0f;
*pOutT1++ = 1.0;
/* Writing all zeroes in upper triangle of the destination matrix */
j = rowCnt - 1U;
while (j > 0U)
{
*pOutT1++ = 0.0f;
*pOutT1++ = 0.0;
j--;
}
/* Decrement the loop counter */
/* Decrement loop counter */
rowCnt--;
}
@@ -506,7 +460,7 @@ arm_status arm_mat_inverse_f64(
/* Index modifier to navigate through the columns */
l = 0U;
//for(loopCnt = 0U; loopCnt < numCols; loopCnt++)
while (loopCnt > 0U)
{
/* Check if the pivot element is zero..
@@ -525,22 +479,19 @@ arm_status arm_mat_inverse_f64(
/* Temporary variable to hold the pivot value */
in = *pInT1;
/* Destination pointer modifier */
k = 1U;
/* Check if the pivot element is zero */
if (*pInT1 == 0.0f)
if (*pInT1 == 0.0)
{
/* Loop over the number rows present below */
for (i = (l + 1U); i < numRows; i++)
for (i = 1U; i < numRows-l; i++)
{
/* Update the input and destination pointers */
pInT2 = pInT1 + (numCols * l);
pOutT2 = pOutT1 + (numCols * k);
pInT2 = pInT1 + (numCols * i);
pOutT2 = pOutT1 + (numCols * i);
/* Check if there is a non zero pivot element to
* replace in the rows below */
if (*pInT2 != 0.0f)
if (*pInT2 != 0.0)
{
/* Loop over number of columns
* to the right of the pilot element */
@@ -565,14 +516,12 @@ arm_status arm_mat_inverse_f64(
/* Break after exchange is done */
break;
}
/* Update the destination pointer modifier */
k++;
}
}
/* Update the status if the matrix is singular */
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
return ARM_MATH_SINGULAR;
}
@@ -640,6 +589,7 @@ arm_status arm_mat_inverse_f64(
*pInT1 = *pInT1 - (in * *pPRT_in++);
pInT1++;
}
/* Loop over the number of columns to
replace the elements in the destination matrix */
for (j = 0U; j < numCols; j++)
@@ -651,30 +601,32 @@ arm_status arm_mat_inverse_f64(
}
}
/* Increment the temporary input pointer */
/* Increment temporary input pointer */
pInT1 = pInT1 + l;
}
/* Increment the input pointer */
pIn++;
/* Decrement the loop counter */
loopCnt--;
/* Increment the index modifier */
l++;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
if ((flag != 1U) && (in == 0.0f))
if ((flag != 1U) && (in == 0.0))
{
pIn = pSrc->pData;
for (i = 0; i < numRows * numCols; i++)
{
if (pIn[i] != 0.0f)
if (pIn[i] != 0.0)
break;
}
@@ -682,10 +634,11 @@ arm_status arm_mat_inverse_f64(
status = ARM_MATH_SINGULAR;
}
}
/* Return to application */
return (status);
}
/**
 * @} end of MatrixInv group
 */

View File

@@ -0,0 +1,509 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_ldl_f32.c
* Description: Floating-point LDL decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/// @private
#define SWAP_ROWS_F32(A,i,j) \
{ \
int cnt = n; \
\
for(int w=0;w < n; w+=4) \
{ \
f32x4_t tmpa,tmpb; \
mve_pred16_t p0 = vctp32q(cnt); \
\
tmpa=vldrwq_z_f32(&A[i*n + w],p0);\
tmpb=vldrwq_z_f32(&A[j*n + w],p0);\
\
vstrwq_p(&A[i*n + w], tmpb, p0); \
vstrwq_p(&A[j*n + w], tmpa, p0); \
\
cnt -= 4; \
} \
}
/// @private
#define SWAP_COLS_F32(A,i,j) \
for(int w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f32(
const arm_matrix_instance_f32 * pSrc,
arm_matrix_instance_f32 * pl,
arm_matrix_instance_f32 * pd,
uint16_t * pp)
{
arm_status status; /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pSrc->numCols) ||
(pl->numRows != pl->numCols) ||
(pd->numRows != pd->numCols) ||
(pl->numRows != pd->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
const int n=pSrc->numRows;
int fullRank = 1, diag,k;
float32_t *pA;
memset(pd->pData,0,sizeof(float32_t)*n*n);
memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
pA = pl->pData;
int cnt = n;
uint16x8_t vecP;
for(int k=0;k < n; k+=8)
{
mve_pred16_t p0;
p0 = vctp16q(cnt);
vecP = vidupq_u16((uint16_t)k, 1);
vstrhq_p(&pp[k], vecP, p0);
cnt -= 8;
}
for(k=0;k < n; k++)
{
/* Find pivot */
float32_t m=F32_MIN,a;
int j=k;
for(int r=k;r<n;r++)
{
if (pA[r*n+r] > m)
{
m = pA[r*n+r];
j = r;
}
}
if(j != k)
{
SWAP_ROWS_F32(pA,k,j);
SWAP_COLS_F32(pA,k,j);
}
pp[k] = j;
a = pA[k*n+k];
if (fabsf(a) < 1.0e-8f)
{
fullRank = 0;
break;
}
float32_t invA;
invA = 1.0f / a;
int32x4_t vecOffs;
int w;
vecOffs = vidupq_u32((uint32_t)0, 1);
vecOffs = vmulq_n_s32(vecOffs,n);
for(w=k+1; w<n; w+=4)
{
int cnt = n - k - 1;
f32x4_t vecX;
f32x4_t vecA;
f32x4_t vecW0,vecW1, vecW2, vecW3;
mve_pred16_t p0;
vecW0 = vdupq_n_f32(pA[(w + 0)*n+k]);
vecW1 = vdupq_n_f32(pA[(w + 1)*n+k]);
vecW2 = vdupq_n_f32(pA[(w + 2)*n+k]);
vecW3 = vdupq_n_f32(pA[(w + 3)*n+k]);
for(int x=k+1;x<n;x += 4)
{
p0 = vctp32q(cnt);
//pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
vecA = vfmsq_m(vecA, vecW0, vecX, p0);
vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
vecA = vfmsq_m(vecA, vecW1, vecX, p0);
vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
vecA = vfmsq_m(vecA, vecW2, vecX, p0);
vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);
vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
vecA = vfmsq_m(vecA, vecW3, vecX, p0);
vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);
cnt -= 4;
}
}
for(; w<n; w++)
{
int cnt = n - k - 1;
f32x4_t vecA,vecX,vecW;
mve_pred16_t p0;
vecW = vdupq_n_f32(pA[w*n+k]);
for(int x=k+1;x<n;x += 4)
{
p0 = vctp32q(cnt);
//pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
vecA = vldrwq_z_f32(&pA[w*n+x],p0);
vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
vecA = vfmsq_m(vecA, vecW, vecX, p0);
vstrwq_p(&pA[w*n+x], vecA, p0);
cnt -= 4;
}
}
for(int w=k+1;w<n;w++)
{
pA[w*n+k] = pA[w*n+k] * invA;
}
}
diag=k;
if (!fullRank)
{
diag--;
for(int row=0; row < n;row++)
{
mve_pred16_t p0;
int cnt= n-k;
f32x4_t zero=vdupq_n_f32(0.0f);
for(int col=k; col < n;col += 4)
{
p0 = vctp32q(cnt);
vstrwq_p(&pl->pData[row*n+col], zero, p0);
cnt -= 4;
}
}
}
for(int row=0; row < n;row++)
{
mve_pred16_t p0;
int cnt= n-row-1;
f32x4_t zero=vdupq_n_f32(0.0f);
for(int col=row+1; col < n;col+=4)
{
p0 = vctp32q(cnt);
vstrwq_p(&pl->pData[row*n+col], zero, p0);
cnt -= 4;
}
}
for(int d=0; d < diag;d++)
{
pd->pData[d*n+d] = pl->pData[d*n+d];
pl->pData[d*n+d] = 1.0;
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/// @private
#define SWAP_ROWS_F32(A,i,j) \
for(w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[i*n + w]; \
A[i*n + w] = A[j*n + w];\
A[j*n + w] = tmp; \
}
/// @private
#define SWAP_COLS_F32(A,i,j) \
for(w=0;w < n; w++) \
{ \
float32_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f32(
  const arm_matrix_instance_f32 * pSrc,
  arm_matrix_instance_f32 * pl,
  arm_matrix_instance_f32 * pd,
  uint16_t * pp)
{
  arm_status status;                             /* status of matrix inverse */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pl->numRows != pl->numCols) ||
      (pd->numRows != pd->numCols) ||
      (pl->numRows != pd->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    const int n = pSrc->numRows;
    int fullRank = 1, diag, k;
    float32_t *pA;
    int row, d;

    /* D starts as the zero matrix; L starts as a working copy of the
       source and the factorization is computed in place. */
    memset(pd->pData, 0, sizeof(float32_t)*n*n);
    memcpy(pl->pData, pSrc->pData, n*n*sizeof(float32_t));
    pA = pl->pData;

    /* Start from the identity permutation */
    for (k = 0; k < n; k++)
    {
      pp[k] = k;
    }

    for (k = 0; k < n; k++)
    {
      /* Find pivot: largest diagonal entry in rows k..n-1 (valid pivot
         choice for a positive semi-definite input, per the contract). */
      float32_t m = F32_MIN, a;
      int j = k;
      int r;
      int w;

      for (r = k; r < n; r++)
      {
        if (pA[r*n+r] > m)
        {
          m = pA[r*n+r];
          j = r;
        }
      }

      if (j != k)
      {
        /* Symmetric permutation: swap both rows and columns k <-> j */
        SWAP_ROWS_F32(pA, k, j);
        SWAP_COLS_F32(pA, k, j);
      }

      /* Record the pivot row chosen at step k */
      pp[k] = j;

      a = pA[k*n+k];

      /* A (near-)zero pivot means the trailing block is numerically rank
         deficient: stop the factorization here. */
      if (fabsf(a) < 1.0e-8f)
      {
        fullRank = 0;
        break;
      }

      {
        /* Hoist the pivot reciprocal out of the O(n^2) update: one
           division per step instead of one per updated element. This
           also matches the Helium implementation of this function. */
        const float32_t invA = 1.0f / a;

        /* Rank-1 update of the trailing submatrix */
        for (w = k+1; w < n; w++)
        {
          int x;
          for (x = k+1; x < n; x++)
          {
            pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
          }
        }

        /* Normalize column k below the pivot */
        for (w = k+1; w < n; w++)
        {
          pA[w*n+k] = pA[w*n+k] * invA;
        }
      }
    }

    /* Number of valid diagonal entries produced by the factorization */
    diag = k;
    if (!fullRank)
    {
      diag--;
      /* Zero the trailing columns that were never factored */
      for (row = 0; row < n; row++)
      {
        int col;
        for (col = k; col < n; col++)
        {
          pl->pData[row*n+col] = 0.0f;
        }
      }
    }

    /* L is lower triangular: clear everything above the diagonal */
    for (row = 0; row < n; row++)
    {
      int col;
      for (col = row+1; col < n; col++)
      {
        pl->pData[row*n+col] = 0.0f;
      }
    }

    /* Move the pivots into D and set a unit diagonal in L */
    for (d = 0; d < diag; d++)
    {
      pd->pData[d*n+d] = pl->pData[d*n+d];
      pl->pData[d*n+d] = 1.0f;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,229 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_ldl_f64.c
* Description: Floating-point LDL decomposition
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
#include <math.h>
/// @private
#define SWAP_ROWS_F64(A,i,j) \
{ \
int w; \
for(w=0;w < n; w++) \
{ \
float64_t tmp; \
tmp = A[i*n + w]; \
A[i*n + w] = A[j*n + w];\
A[j*n + w] = tmp; \
} \
}
/// @private
#define SWAP_COLS_F64(A,i,j) \
{ \
int w; \
for(w=0;w < n; w++) \
{ \
float64_t tmp; \
tmp = A[w*n + i]; \
A[w*n + i] = A[w*n + j];\
A[w*n + j] = tmp; \
} \
}
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixChol
@{
*/
/**
* @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
* @param[in] pSrc points to the instance of the input floating-point matrix structure.
* @param[out] pl points to the instance of the output floating-point triangular matrix structure.
* @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
* @param[out] pp points to the instance of the output floating-point permutation vector.
* @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
* @return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
- \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
* @par
* Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
*/
arm_status arm_mat_ldlt_f64(
  const arm_matrix_instance_f64 * pSrc,
  arm_matrix_instance_f64 * pl,
  arm_matrix_instance_f64 * pd,
  uint16_t * pp)
{
  arm_status status;   /* execution status returned to the caller */

#ifdef ARM_MATH_MATRIX_CHECK

  /* All three matrices must be square and share the same dimension */
  if ((pSrc->numRows != pSrc->numCols) ||
      (pl->numRows   != pl->numCols)   ||
      (pd->numRows   != pd->numCols)   ||
      (pl->numRows   != pd->numRows)   )
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    const int n = pSrc->numRows;
    float64_t *work;          /* in-place factorization buffer (pl->pData) */
    int step;                 /* current elimination step */
    int fullRank = 1;         /* cleared when a near-zero pivot is met */
    int validDiag;            /* number of valid diagonal entries */
    int row, col, d;

    /* D begins as the zero matrix; L begins as a copy of the source and
       is overwritten with the factorization. */
    memset(pd->pData, 0, sizeof(float64_t)*n*n);
    memcpy(pl->pData, pSrc->pData, n*n*sizeof(float64_t));
    work = pl->pData;

    /* Identity permutation to start with */
    for (step = 0; step < n; step++)
    {
      pp[step] = step;
    }

    for (step = 0; step < n; step++)
    {
      float64_t bestVal = F64_MIN;   /* largest diagonal entry seen so far */
      float64_t pivot;
      int bestRow = step;
      int r, w;

      /* Diagonal pivoting: pick the largest diagonal entry in the
         trailing block (rows step..n-1). */
      for (r = step; r < n; r++)
      {
        if (work[r*n + r] > bestVal)
        {
          bestVal = work[r*n + r];
          bestRow = r;
        }
      }

      if (bestRow != step)
      {
        /* Apply the symmetric permutation to both rows and columns */
        SWAP_ROWS_F64(work, step, bestRow);
        SWAP_COLS_F64(work, step, bestRow);
      }

      /* Remember which row supplied the pivot at this step */
      pp[step] = bestRow;

      pivot = work[step*n + step];

      /* A near-zero pivot means the trailing block is numerically rank
         deficient: stop factoring here. */
      if (fabs(pivot) < 1.0e-18)
      {
        fullRank = 0;
        break;
      }

      /* Rank-1 update of the trailing submatrix */
      for (w = step+1; w < n; w++)
      {
        int x;
        for (x = step+1; x < n; x++)
        {
          work[w*n + x] = work[w*n + x] - work[w*n + step] * work[x*n + step] / pivot;
        }
      }

      /* Normalize column `step` below the pivot */
      for (w = step+1; w < n; w++)
      {
        work[w*n + step] = work[w*n + step] / pivot;
      }
    }

    validDiag = step;
    if (!fullRank)
    {
      validDiag--;
      /* Zero the trailing columns that were never factored */
      for (row = 0; row < n; row++)
      {
        for (col = step; col < n; col++)
        {
          pl->pData[row*n + col] = 0.0;
        }
      }
    }

    /* L is lower triangular: clear everything above the diagonal */
    for (row = 0; row < n; row++)
    {
      for (col = row+1; col < n; col++)
      {
        pl->pData[row*n + col] = 0.0;
      }
    }

    /* Move the pivots into D and set a unit diagonal in L */
    for (d = 0; d < validDiag; d++)
    {
      pd->pData[d*n + d] = pl->pData[d*n + d];
      pl->pData[d*n + d] = 1.0;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
@} end of MatrixChol group
*/

View File

@@ -0,0 +1,763 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f16.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
* @ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * 2x2 specialization of the f16 matrix multiply using Helium gathers.
 * All four products are computed in the lower 4 lanes of one 8-lane
 * vector (upper lanes are don't-care, masked out at the final store).
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
    const arm_matrix_instance_f16 *pSrcA,
    const arm_matrix_instance_f16 *pSrcB,
    arm_matrix_instance_f16 *pDst)
{
    /* offsetA duplicates one column element of A per output row */
    static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
    /* offsetB allows to read and duplicate 1 row of B */
    static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };

    uint16x8_t vecOffsA, vecOffsB;
    f16x8_t vecInA, vecInB, vecDst;
    float16_t *pOut = pDst->pData;  /* output data matrix pointer */

    /*
     * load initial offsets
     */
    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
    /*
     * load {a00 a00 a10 a10 x x x x }
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * load {b00 b01 b00 b01 x x x x }
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00       a00 b01
     *   a10 b00       a10 b01
     *   x             x
     *   x             x       }
     */
    vecDst = vmulq(vecInA, vecInB);
    /*
     * move to 2nd column of matrix A
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a01 a01 a11 a11 x x x x}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
    /*
     * load {b10, b11, b10, b11, x x x x }
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00 + a01 b10    a00 b01 + a01 b11
     *   a10 b00 + a11 b10    a10 b01 + a11 b11
     *   x                    x
     *   x                    x                  }
     */
    vecDst = vfmaq(vecDst, vecInA, vecInB);

    /* Predicate covering exactly the 4 valid result lanes */
    mve_pred16_t p0 = vctp16q(2*2);
    /*
     * Store the result in the destination buffer
     * (lower half of the vector)
     */
    vstrhq_p(pOut, vecDst, p0);

    return (ARM_MATH_SUCCESS);
}
/*
 * 3x3 specialization of the f16 matrix multiply using Helium gathers.
 * The first 8 of the 9 outputs are computed vectorized; the last element
 * (row 2, col 2) is computed in scalar code at the end.
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
    const arm_matrix_instance_f16 *pSrcA,
    const arm_matrix_instance_f16 *pSrcB,
    arm_matrix_instance_f16 *pDst)
{
    /* offsetA duplicates one column element of A per output row */
    static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
    /* offsetB allows to read and duplicate 1 row of B */
    static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };

    uint16x8_t vecOffsA, vecOffsB;
    f16x8_t vecInA, vecInB, vecDst;
    float16_t *pOut = pDst->pData;  /* output data matrix pointer */

    /*
     * load initial offsets
     */
    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);

    /*
     * load {a00 a00 a00 a10 a10 a10 a20 a20}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * load {b00 b01 b02 b00 b01 b02 b00 b01}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00   a00 b01   a00 b02
     *   a10 b00   a10 b01   a10 b02
     *   a20 b00   a20 b01 }
     */
    vecDst = vmulq(vecInA, vecInB);
    /*
     * move to 2nd column of matrix A
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a01 a01 a01 a11 a11 a11 a21 a21}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
    /*
     * load {b10, b11, b12, b10, b11, b12, b10, b11}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00 + a01 b10   a00 b01 + a01 b11   a00 b02 + a01 b12
     *   a10 b00 + a11 b10   a10 b01 + a11 b11   a10 b02 + a11 b12
     *   a20 b00 + a21 b10   a20 b01 + a21 b11 }
     */
    vecDst = vfmaq(vecDst, vecInA, vecInB);
    /*
     * move to 3rd column of matrix A
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
    /*
     * load {a02 a02 a02 a12 a12 a12 a22 a22}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
    /*
     * load {b20, b21, b22, b20, b21, b22, b20, b21}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * {a00 b00 + a01 b10 + a02 b20  a00 b01 + a01 b11 + a02 b21  a00 b02 + a01 b12 + a02 b22},
     *  a10 b00 + a11 b10 + a12 b20  a10 b01 + a11 b11 + a12 b21  a10 b02 + a11 b12 + a12 b22},
     *  a20 b00 + a21 b10 + a22 b20  a20 b01 + a21 b11 + a22 b21 }
     */
    vecDst = vfmaq(vecDst, vecInA, vecInB);

    /*
     * Store the result in the destination buffer
     */
    vst1q(pOut, vecDst); pOut += 8;

    /* last element computed in scalar mode
     * a20 b02 + a21 b12 + a22 b22
     */
    _Float16 * pA = (_Float16 *)pSrcA->pData;
    _Float16 * pB = (_Float16 *)pSrcB->pData;
    *pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];

    return (ARM_MATH_SUCCESS);
}
/*
 * 4x4 specialization of the f16 matrix multiply using Helium gathers.
 * The 16 outputs are accumulated in two 8-lane vectors: vecDst0 holds
 * result rows 0-1, vecDst1 holds result rows 2-3.
 */
__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
    const arm_matrix_instance_f16 *pSrcA,
    const arm_matrix_instance_f16 *pSrcB,
    arm_matrix_instance_f16 *pDst)
{
    /* offsetA allows to read and duplicate 2 successive column elements of A */
    static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
    /* offsetB allows to read and duplicate 1 row of B */
    static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };

    uint16x8_t vecOffsA, vecOffsB;
    f16x8_t vecInA, vecInB, vecDst0, vecDst1;
    float16_t *pOut = pDst->pData;  /* output data matrix pointer */

    /*
     * load initial offsets
     */
    vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
    vecOffsB = vldrhq_u16((uint16_t const *) offsetB);

    /*
     * load {a00 a00 a00 a00 a10 a10 a10 a10}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * load {b00 b01 b02 b03 b00 b01 b02 b03}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);

    /*
     * { a00 b00   a00 b01   a00 b02   a00 b03
     *   a10 b00   a10 b01   a10 b02   a10 b03 }
     */
    vecDst0 = vmulq(vecInA, vecInB);
    /*
     * jump 2 x A rows (2nd half of matrix)
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a20 a20 a20 a20 a30 a30 a30 a30}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * { a20 b00   a20 b01   a20 b02   a20 b03
     *   a30 b00   a30 b01   a30 b02   a30 b03 }
     */
    vecDst1 = vmulq(vecInA, vecInB);
    /*
     * rewind back to top half of the A matrix (2nd column)
     */
    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
    /*
     * load {a01 a01 a01 a01 a11 a11 a11 a11}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b10, b11, b12, b13, b10, b11, b12, b13}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00 + a01 b10   a00 b01 + a01 b11   a00 b02 + a01 b12   a00 b03 + a01 b13
     *   a10 b00 + a11 b10   a10 b01 + a11 b11   a10 b02 + a11 b12   a10 b03 + a11 b13 }
     */
    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
    /*
     * jump 2 x A rows (2nd half of matrix)
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a21 a21 a21 a21 a31 a31 a31 a31}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * {a20 b00 + a21 b10   a20 b01 + a21 b11   a20 b02 + a21 b12   a20 b03 + a21 b13
     *  a30 b00 + a31 b10   a30 b01 + a31 b11   a30 b02 + a31 b12   a30 b03 + a31 b13 }
     */
    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
    /*
     * rewind back to top half of the A matrix (3rd column)
     */
    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
    /*
     * load {a02 a02 a02 a02 a12 a12 a12 a12}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b20, b21, b22, b23, b20, b21, b22, b23}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00 + a01 b10 + a02 b20   a00 b01 + a01 b11 + a02 b21   a00 b02 + a01 b12 + a02 b22   a00 b03 + a01 b13 + a02 b23
     *   a10 b00 + a11 b10 + a12 b20   a10 b01 + a11 b11 + a12 b21   a10 b02 + a11 b12 + a12 b22   a10 b03 + a11 b13 + a12 b23 }
     */
    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
    /*
     * jump 2 x A rows
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a22 a22 a22 a22 a32 a32 a32 a32}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * {a20 b00 + a21 b10 + a22 b20   a20 b01 + a21 b11 + a22 b21   a20 b02 + a21 b12 + a22 b22   a20 b03 + a21 b13 + a22 b23
     *  a30 b00 + a31 b10 + a32 b20   a30 b01 + a31 b11 + a32 b21   a30 b02 + a31 b12 + a32 b22   a30 b03 + a31 b13 + a32 b23 }
     */
    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
    /*
     * rewind back to top half of the A matrix (4th column)
     */
    vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
    /*
     * load {a03 a03 a03 a03 a13 a13 a13 a13}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * move to next B row
     */
    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
    /*
     * load {b30, b31, b32, b33, b30, b31, b32, b33}
     */
    vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
    /*
     * { a00 b00 +...+ a03 b30,   a00 b01 +...+ a03 b31,   a00 b02 +...+ a03 b32,   a00 b03 +...+ a03 b33
     *   a10 b00 +...+ a13 b30,   a10 b01 +...+ a13 b31,   a10 b02 +...+ a13 b32,   a10 b03 +...+ a13 b33 }
     */
    vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
    /*
     * jump 2 x A rows
     */
    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
    /*
     * load {a23 a23 a23 a23 a33 a33 a33 a33}
     */
    vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
    /*
     * {a20 b00 +...+ a23 b30,   a20 b01 +...+ a23 b31,   a20 b02 +...+ a23 b32,   a20 b03 +...+ a23 b33
     *  a30 b00 +...+ a33 b30,   a30 b01 +...+ a33 b31,   a30 b02 +...+ a33 b32,   a30 b03 +...+ a33 b33 }
     */
    vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
    /*
     * Store the result in the destination buffer
     */
    vst1q(pOut, vecDst0); pOut += 8;
    vst1q(pOut, vecDst1);

    return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_f16(
const arm_matrix_instance_f16 * pSrcA,
const arm_matrix_instance_f16 * pSrcB,
arm_matrix_instance_f16 * pDst)
{
float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
float16_t *pOut = pDst->pData; /* output data matrix pointer */
int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t blkCnt; /* loop counters */
int i;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
return(ARM_MATH_SIZE_MISMATCH);
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if(numRowsA == 2)
return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 4)
return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
}
/* main loop process 4 rows */
i = numRowsA / 4;
while(i > 0)
{
float16_t *pInA0, *pInA1, *pInA2, *pInA3;
float16_t *pInB0;
float16_t *pOut0, *pOut1, *pOut2, *pOut3;
f16x8_t vecMac0, vecMac1, vecMac2, vecMac3;
f16x8_t vecInB;
/* pointers to 4 consecutive output rows */
pOut0 = pOut;
pOut1 = pOut0 + numColsB;
pOut2 = pOut1 + numColsB;
pOut3 = pOut2 + numColsB;
pInB0 = pInB;
int k = numColsB >> 3;
while(k > 0)
{
/* pointers to 4 consecutive Matrix A rows */
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
vecMac0 = vdupq_n_f16(0.0f16);
vecMac1 = vdupq_n_f16(0.0f16);
vecMac2 = vdupq_n_f16(0.0f16);
vecMac3 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3..., bi,4n+7}
*/
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x 8 block) in the destination buffer */
vst1q(pOut0, vecMac0); pOut0 += 8;
vst1q(pOut1, vecMac1); pOut1 += 8;
vst1q(pOut2, vecMac2); pOut2 += 8;
vst1q(pOut3, vecMac3); pOut3 += 8;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 8;
k--;
}
int colBLeft = numColsB & 7;
if (colBLeft)
{
pInA0 = pInA;
pInA1 = pInA0 + numColsA;
pInA2 = pInA1 + numColsA;
pInA3 = pInA2 + numColsA;
mve_pred16_t p0 = vctp16q(colBLeft);
vecMac0 = vdupq_n_f16(0.0f16);
vecMac1 = vdupq_n_f16(0.0f16);
vecMac2 = vdupq_n_f16(0.0f16);
vecMac3 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..bi,4n+colBLeft-1, 0, ..}
*/
vecInB = vldrhq_z_f16(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (4 x colBLeft block) in the destination buffer */
vstrhq_p_f16(pOut0, vecMac0, p0);
vstrhq_p_f16(pOut1, vecMac1, p0);
vstrhq_p_f16(pOut2, vecMac2, p0);
vstrhq_p_f16(pOut3, vecMac3, p0);
}
pInA += 4 * numColsA;
pOut += 4 * numColsB;
i--;
}
/*
* non multiple of 4 rows for Matrix A
* process single row
*/
if (numRowsA & 3)
{
i = numRowsA & 3;
do
{
float16_t *pInA0;
float16_t *pInB0;
float16_t *pOut0;
f16x8_t vecInB;
f16x8_t vecMac0;
pOut0 = pOut;
pInB0 = pInB;
int k = numColsB >> 3;
while(k > 0)
{
pInA0 = pInA;
vecMac0 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3, ...bi,4n+7}
*/
vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x 8 block) in the destination buffer */
vst1q(pOut0, vecMac0); pOut0 += 8;
/*
* rewind
*/
pInB0 -= (numColsB * numColsA) - 8;
k--;
}
int colBLeft = numColsB & 7;
if (colBLeft)
{
pInA0 = pInA;
mve_pred16_t p0 = vctp16q(colBLeft);
vecMac0 = vdupq_n_f16(0.0f16);
blkCnt = numColsA;
while (blkCnt > 0U)
{
/*
* load {bi,4n+0, bi,4n+1, bi,4n+2, ..., bi,4n+colBLeft, 0, ...}
*/
vecInB = vldrhq_z_f16(pInB0, p0);
vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
pInB0 = pInB0 + numColsB;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* Store the results (1 x colBLeft block) in the destination buffer */
vstrhq_p_f16(pOut0, vecMac0, p0);
}
pInA += 1 * numColsA;
pOut += 1 * numColsB;
}
while (--i);
}
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
}
#else
/**
 * @brief Floating-point (f16) matrix multiplication, scalar implementation.
 *
 * Computes pDst = pSrcA * pSrcB one output element at a time: each element is
 * the dot product of a row of A with a column of B. Accumulation is done in
 * _Float16, matching the element type.
 *
 * @param[in]  pSrcA  points to the first input matrix structure
 * @param[in]  pSrcB  points to the second input matrix structure
 * @param[out] pDst   points to the output matrix structure
 * @return ARM_MATH_SIZE_MISMATCH when size checking is enabled and the
 *         dimensions do not agree, ARM_MATH_SUCCESS otherwise.
 */
arm_status arm_mat_mult_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pRowA   = pSrcA->pData;            /* start of the A row being processed        */
  float16_t *pBaseB  = pSrcB->pData;            /* top-left element of B                     */
  float16_t *pWrite  = pDst->pData;             /* next output element (row-major order)     */
  uint16_t   numRowsA = pSrcA->numRows;         /* number of rows of input matrix A          */
  uint16_t   numColsB = pSrcB->numCols;         /* number of columns of input matrix B       */
  uint16_t   numColsA = pSrcA->numCols;         /* number of columns of input matrix A       */
  arm_status status;                            /* status of matrix multiplication           */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Inner dimensions must agree and pDst must carry the outer dimensions. */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows) ||
      (pSrcB->numCols != pDst->numCols) )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    uint32_t rowsLeft = numRowsA;

    /* Row loop: one pass per row of A / row of the output. */
    do
    {
      uint32_t colsLeft = numColsB;

      /* Column loop: one dot product per output element. */
      do
      {
        const float16_t *pA = pRowA;                           /* walks along the current A row  */
        const float16_t *pB = pBaseB + (numColsB - colsLeft);  /* walks down the current B column */
        _Float16 acc = 0.0f16;                                 /* dot-product accumulator         */
        uint32_t cnt;

#if defined (ARM_MATH_LOOPUNROLL)
        /* Loop unrolling: 4 multiply-accumulates per iteration. */
        cnt = numColsA >> 2U;
        while (cnt > 0U)
        {
          /* c(m,n) = a(m,1) * b(1,n) + a(m,2) * b(2,n) + .... + a(m,p) * b(p,n) */
          acc += (_Float16)*pA++ * (_Float16)*pB; pB += numColsB;
          acc += (_Float16)*pA++ * (_Float16)*pB; pB += numColsB;
          acc += (_Float16)*pA++ * (_Float16)*pB; pB += numColsB;
          acc += (_Float16)*pA++ * (_Float16)*pB; pB += numColsB;
          cnt--;
        }
        /* Remaining MACs when numColsA is not a multiple of 4. */
        cnt = numColsA % 0x4U;
#else
        /* No unrolling: one MAC per column of A. */
        cnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (cnt > 0U)
        {
          acc += (_Float16)*pA++ * (_Float16)*pB;
          pB += numColsB;
          cnt--;
        }

        /* Store the finished dot product; output is filled sequentially. */
        *pWrite++ = acc;
        colsLeft--;
      } while (colsLeft > 0U);

      /* Advance to the next row of A. */
      pRowA += numColsA;
      rowsLeft--;
    } while (rowsLeft > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_mult_f32.c
* Description: Floating-point matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,7 +26,11 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
#if defined(ARM_MATH_NEON)
#define GROUPOFROWS 8
#endif
/**
* @ingroup groupMatrix
@@ -54,15 +58,473 @@
* @{
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/**
 * @brief 2x2 floating-point matrix multiply, Helium/MVE specialization.
 *
 * Computes the whole 2x2 product in two vector FMA steps using gather loads:
 * the result vector holds {d00, d01, d10, d11} and is written in one store.
 * Assumes pSrcA/pSrcB/pDst all describe 2x2 matrices (caller-checked).
 */
__STATIC_INLINE arm_status arm_mat_mult_f32_2x2_mve(
    const arm_matrix_instance_f32 *pSrcA,
    const arm_matrix_instance_f32 *pSrcB,
    arm_matrix_instance_f32 *pDst)
{
    /* Gather offsets (in elements) into the row-major 2x2 sources. */
    /* {a00, a00, a10, a10} */
    static const uint32_t offsetA0[4] = { 0, 0, 2, 2 };
    /* {b00, b01, b00, b01} */
    static const uint32_t offsetB0[4] = { 0, 1, 0, 1 };
    /* {a01, a01, a11, a11} */
    static const uint32_t offsetA1[4] = { 1, 1, 3, 3 };
    /* {b10, b11, b10, b11} */
    static const uint32_t offsetB1[4] = { 2, 3, 2, 3 };

    uint32x4_t vecOffsA, vecOffsB;
    f32x4_t vecInA, vecInB, vecDst;

    /* First partial product: column-0 elements of A times row 0 of B. */
    vecOffsA = vldrwq_u32((uint32_t const *) offsetA0);
    vecOffsB = vldrwq_u32((uint32_t const *) offsetB0);

    vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
    vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);

    vecDst = vmulq(vecInA, vecInB);

    /* Second partial product: column-1 elements of A times row 1 of B, accumulated. */
    vecOffsA = vldrwq_u32((uint32_t const *) offsetA1);
    vecOffsB = vldrwq_u32((uint32_t const *) offsetB1);

    vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
    vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);

    vecDst = vfmaq(vecDst, vecInA, vecInB);

    /* Store the full 2x2 result with a single vector write. */
    vstrwq_f32(pDst->pData, vecDst);

    return (ARM_MATH_SUCCESS);
}
/*
* A = {{a00, a01, a02},
* {a10, a11, a12},
* {a20, a21, a22}}
* B = {{b00, b01, b02},
* {b10, b11, b12},
* {b20, b21, b22}}
*
* Dst = {{a00 b00 + a01 b10 + a02 b20, a00 b01 + a01 b11 + a02 b21, a00 b02 + a01 b12 + a02 b22},
* {a10 b00 + a11 b10 + a12 b20, a10 b01 + a11 b11 + a12 b21, a10 b02 + a11 b12 + a12 b22},
* {a20 b00 + a21 b10 + a22 b20, a20 b01 + a21 b11 + a22 b21, a20 b02 + a21 b12 + a22 b22}}
*/
/**
 * @brief 3x3 floating-point matrix multiply, Helium/MVE specialization.
 *
 * Keeps one vector accumulator per output row; each step loads one row of B
 * (predicated to 3 active lanes) and multiply-accumulates it with the
 * corresponding scalar element of each A row, per the expansion documented
 * above. Assumes all three matrices are 3x3 (caller-checked).
 */
__STATIC_INLINE arm_status arm_mat_mult_f32_3x3_mve(
    const arm_matrix_instance_f32 *pSrcA,
    const arm_matrix_instance_f32 *pSrcB,
    arm_matrix_instance_f32 *pDst)
{
    float32_t *pInB = pSrcB->pData;  /* input data matrix pointer B */
    float32_t *pInA = pSrcA->pData;  /* input data matrix pointer A */
    float32_t *pOut = pDst->pData;   /* output data matrix pointer */
    float32_t *pInA0, *pInA1, *pInA2;       /* pointers into the three A rows */
    f32x4_t vecMac0, vecMac1, vecMac2;      /* one accumulator per output row */
    f32x4_t vecInB;
    float32_t const *pSrBVec;

    pSrBVec = (float32_t const *) pInB;

    pInA0 = pInA;
    pInA1 = pInA0 + MATRIX_DIM3;
    pInA2 = pInA1 + MATRIX_DIM3;
    /* enable predication to disable last (4th) vector element */
    mve_pred16_t p0 = vctp32q(MATRIX_DIM3);

    /*
     * load {b0,0, b0,1, b0,2, 0}
     */
    vecInB = vldrwq_z_f32(pSrBVec, p0);
    pSrBVec += MATRIX_DIM3;

    /* Initialize accumulators with ai,0 * row 0 of B. */
    vecMac0 = vmulq(vecInB, *pInA0++);
    vecMac1 = vmulq(vecInB, *pInA1++);
    vecMac2 = vmulq(vecInB, *pInA2++);

    /*
     * load {b1,0, b1,1, b1,2, 0}
     */
    vecInB = vldrwq_z_f32(pSrBVec, p0);
    pSrBVec += MATRIX_DIM3;

    /* Accumulate ai,1 * row 1 of B. */
    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);

    /*
     * load {b2,0, b2,1, b2,2, 0}
     */
    vecInB = vldrwq_z_f32(pSrBVec, p0);
    pSrBVec += MATRIX_DIM3;

    /* Accumulate ai,2 * row 2 of B. */
    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);

    /* partial vector stores: only 3 lanes of each accumulator are written */
    vstrwq_p_f32(pOut, vecMac0, p0);
    pOut += MATRIX_DIM3;
    vstrwq_p_f32(pOut, vecMac1, p0);
    pOut += MATRIX_DIM3;
    vstrwq_p_f32(pOut, vecMac2, p0);
    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
 * @brief 4x4 floating-point matrix multiply, Helium/MVE specialization.
 *
 * Fully unrolled: one vector accumulator per output row. Each of the four
 * steps loads one full row of B and multiply-accumulates it with the matching
 * scalar element of every A row, so no predication is needed.
 * Assumes all three matrices are 4x4 (caller-checked).
 */
__STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
    const arm_matrix_instance_f32 *pSrcA,
    const arm_matrix_instance_f32 *pSrcB,
    arm_matrix_instance_f32 *pDst)
{
    float32_t const *pSrBVec;
    float32_t *pInB = pSrcB->pData;  /* input data matrix pointer B */
    float32_t *pInA = pSrcA->pData;  /* input data matrix pointer A */
    float32_t *pOut = pDst->pData;   /* output data matrix pointer */
    float32_t *pInA0, *pInA1, *pInA2, *pInA3;        /* pointers into the four A rows  */
    f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;      /* one accumulator per output row */
    f32x4_t vecInB;

    pSrBVec = (float32_t const *) pInB;

    pInA0 = pInA;
    pInA1 = pInA0 + MATRIX_DIM4;
    pInA2 = pInA1 + MATRIX_DIM4;
    pInA3 = pInA2 + MATRIX_DIM4;
    /*
     * load {b0,0, b0,1, b0,2, b0,3}
     */
    vecInB = vld1q(pSrBVec);
    pSrBVec += MATRIX_DIM4;

    /* Initialize accumulators with ai,0 * row 0 of B. */
    vecMac0 = vmulq(vecInB, *pInA0++);
    vecMac1 = vmulq(vecInB, *pInA1++);
    vecMac2 = vmulq(vecInB, *pInA2++);
    vecMac3 = vmulq(vecInB, *pInA3++);
    /*
     * load {b1,0, b1,1, b1,2, b1,3}
     */
    vecInB = vld1q(pSrBVec);
    pSrBVec += MATRIX_DIM4;

    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
    /*
     * load {b2,0, b2,1, b2,2, b2,3}
     */
    vecInB = vld1q(pSrBVec);
    pSrBVec += MATRIX_DIM4;

    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
    /*
     * load {b3,0, b3,1, b3,2, b3,3}
     */
    vecInB = vld1q(pSrBVec);
    pSrBVec += MATRIX_DIM4;

    vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
    vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
    vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
    vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);

    /* Write the four finished output rows. */
    vst1q(pOut, vecMac0);
    pOut += MATRIX_DIM4;
    vst1q(pOut, vecMac1);
    pOut += MATRIX_DIM4;
    vst1q(pOut, vecMac2);
    pOut += MATRIX_DIM4;
    vst1q(pOut, vecMac3);
    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/**
 * @brief Floating-point matrix multiplication (Helium/MVE implementation).
 * @param[in]  *pSrcA points to the first input matrix structure
 * @param[in]  *pSrcB points to the second input matrix structure
 * @param[out] *pDst points to output matrix structure
 * @return The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * Small square matrices (1x1..4x4) dispatch to dedicated routines; the
 * general case processes 4 output rows at a time, 4 output columns per
 * vector, with predicated tails for both dimensions.
 */
arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
  arm_matrix_instance_f32 * pDst)
{
  float32_t *pInB = pSrcB->pData;   /* input data matrix pointer B */
  float32_t *pInA = pSrcA->pData;   /* input data matrix pointer A */
  float32_t *pOut = pDst->pData;    /* output data matrix pointer */
  int numRowsA = pSrcA->numRows;    /* number of rows of input matrix A */
  int numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
  int numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
  uint32_t blkCnt;                  /* loop counters */
  uint32_t i;
  arm_status status;

#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* small squared matrix specialized routines */
    if(numRowsA == numColsB && numColsB == numColsA) {

      if (numRowsA == 1)
      {
        /* 1x1: plain scalar product */
        pOut[0] = pInA[0] * pInB[0];
        return(ARM_MATH_SUCCESS);
      }
      else if(numRowsA == 2)
        return arm_mat_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
      else if(numRowsA == 3)
        return arm_mat_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
      else if(numRowsA == 4)
        return arm_mat_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
    }

    /* main loop process 4 rows */
    i = numRowsA >> 2;
    while (i > 0U)
    {
      float32_t *pInA0, *pInA1, *pInA2, *pInA3;
      float32_t *pInB0;
      float32_t *pOut0, *pOut1, *pOut2, *pOut3;
      f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
      f32x4_t vecInB;

      /* pointers to 4 consecutive output rows */
      pOut0 = pOut;
      pOut1 = pOut0 + numColsB;
      pOut2 = pOut1 + numColsB;
      pOut3 = pOut2 + numColsB;
      pInB0 = pInB;

      /* process the B columns 4 at a time */
      uint32_t k = numColsB >> 2;
      while (k > 0U)
      {
        /* pointers to 4 consecutive Matrix A rows */
        pInA0 = pInA;
        pInA1 = pInA0 + numColsA;
        pInA2 = pInA1 + numColsA;
        pInA3 = pInA2 + numColsA;

        vecMac0 = vdupq_n_f32(0.0f);
        vecMac1 = vdupq_n_f32(0.0f);
        vecMac2 = vdupq_n_f32(0.0f);
        vecMac3 = vdupq_n_f32(0.0f);

        /* walk down the 4-wide column strip of B, one B row per step */
        blkCnt = numColsA;
        while (blkCnt > 0U)
        {
          /*
           * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
           */
          vecInB = *(f32x4_t *)pInB0;   /* vldrwq_f32(pInB0, 0); */

          vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
          vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
          vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
          vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);

          pInB0 = pInB0 + numColsB;
          /*
           * Decrement the blockSize loop counter
           */
          blkCnt--;
        }

        /* Store the results (4 x 4 block) in the destination buffer */
        vst1q(pOut0, vecMac0);
        pOut0 += 4;
        vst1q(pOut1, vecMac1);
        pOut1 += 4;
        vst1q(pOut2, vecMac2);
        pOut2 += 4;
        vst1q(pOut3, vecMac3);
        pOut3 += 4;

        /*
         * rewind: back up numColsA rows of B, advance 4 columns
         */
        pInB0 -= (numColsB * numColsA) - 4;
        k--;
      }

      /* tail: fewer than 4 B columns remain; use a predicated strip */
      int colBLeft = numColsB & 3;
      if (colBLeft)
      {
        pInA0 = pInA;
        pInA1 = pInA0 + numColsA;
        pInA2 = pInA1 + numColsA;
        pInA3 = pInA2 + numColsA;
        mve_pred16_t p0 = vctp32q(colBLeft);

        vecMac0 = vdupq_n_f32(0.0f);
        vecMac1 = vdupq_n_f32(0.0f);
        vecMac2 = vdupq_n_f32(0.0f);
        vecMac3 = vdupq_n_f32(0.0f);

        blkCnt = numColsA;
        while (blkCnt > 0U)
        {
          /*
           * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
           */
          vecInB = vldrwq_z_f32(pInB0, p0);

          vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
          vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
          vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
          vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);

          pInB0 = pInB0 + numColsB;
          /*
           * Decrement the blockSize loop counter
           */
          blkCnt--;
        }

        /* Store the results (4 x colBLeft block) in the destination buffer */
        vstrwq_p_f32(pOut0, vecMac0, p0);
        vstrwq_p_f32(pOut1, vecMac1, p0);
        vstrwq_p_f32(pOut2, vecMac2, p0);
        vstrwq_p_f32(pOut3, vecMac3, p0);
      }

      /* move to next rows */
      pInA += 4 * numColsA;
      pOut += 4 * numColsB;
      i--;
    }

    /*
     * non multiple of 4 rows for Matrix A
     * process single row
     */
    if (numRowsA & 3)
    {
      i = numRowsA & 3;
      while (i > 0U)
      {
        float32_t *pInA0;
        float32_t *pInB0;
        float32_t *pOut0;
        f32x4_t vecInB;
        f32x4_t vecMac0;

        pOut0 = pOut;
        pInB0 = pInB;

        /* process the B columns 4 at a time */
        uint32_t k = numColsB >> 2;
        while (k > 0U)
        {
          pInA0 = pInA;

          vecMac0 = vdupq_n_f32(0.0f);
          blkCnt = numColsA;
          while (blkCnt > 0U)
          {
            /*
             * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
             */
            vecInB = *(f32x4_t *)pInB0;   /* vldrwq_f32(pInB0, 0); */

            vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);

            pInB0 = pInB0 + numColsB;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
          }

          /* Store the results (1 x 4 block) in the destination buffer */
          vst1q(pOut0, vecMac0);
          pOut0 += 4;
          /*
           * rewind: back up numColsA rows of B, advance 4 columns
           */
          pInB0 -= (numColsB * numColsA) - 4;
          k--;
        }

        /* tail: fewer than 4 B columns remain; use a predicated strip */
        int colBLeft = numColsB & 3;
        if (colBLeft)
        {
          pInA0 = pInA;
          mve_pred16_t p0 = vctp32q(colBLeft);

          vecMac0 = vdupq_n_f32(0.0f);
          blkCnt = numColsA;
          while (blkCnt > 0U)
          {
            /*
             * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
             */
            vecInB = vldrwq_z_f32(pInB0, p0);

            vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);

            pInB0 = pInB0 + numColsB;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
          }

          /* Store the results (1 x colBLeft block) in the destination buffer */
          vstrwq_p_f32(pOut0, vecMac0, p0);
        }

        /* move to next row */
        pInA += 1 * numColsA;
        pOut += 1 * numColsB;
        i--;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
#if defined(ARM_MATH_NEON)
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -78,32 +540,224 @@ arm_status arm_mat_mult_f32(
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
float32_t in1, in2, in3, in4;
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
float32x2_t accum = vdup_n_f32(0);
float32_t *pIn1B = pSrcA->pData;
float32_t *pIn1C = pSrcA->pData;
float32_t *pIn1D = pSrcA->pData;
float32_t *pIn1E = pSrcA->pData;
float32_t *pIn1F = pSrcA->pData;
float32_t *pIn1G = pSrcA->pData;
float32_t *pIn1H = pSrcA->pData;
float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH; /* Temporary output data matrix pointer */
float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
/* Row loop */
rowCnt = row >> 3;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + GROUPOFROWS*i;
pxB = px + numColsB;
pxC = px + 2*numColsB;
pxD = px + 3*numColsB;
pxE = px + 4*numColsB;
pxF = px + 5*numColsB;
pxG = px + 6*numColsB;
pxH = px + 7*numColsB;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum0 = 0.0f;
sum1 = 0.0f;
sum2 = 0.0f;
sum3 = 0.0f;
sum4 = 0.0f;
sum5 = 0.0f;
sum6 = 0.0f;
sum7 = 0.0f;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
pIn1B = pIn1 + numColsA;
pIn1C = pIn1 + 2*numColsA;
pIn1D = pIn1 + 3*numColsA;
pIn1E = pIn1 + 4*numColsA;
pIn1F = pIn1 + 5*numColsA;
pIn1G = pIn1 + 6*numColsA;
pIn1H = pIn1 + 7*numColsA;
acc0 = vdupq_n_f32(0.0);
acc1 = vdupq_n_f32(0.0);
acc2 = vdupq_n_f32(0.0);
acc3 = vdupq_n_f32(0.0);
acc4 = vdupq_n_f32(0.0);
acc5 = vdupq_n_f32(0.0);
acc6 = vdupq_n_f32(0.0);
acc7 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1);
a1V = vld1q_f32(pIn1B);
a2V = vld1q_f32(pIn1C);
a3V = vld1q_f32(pIn1D);
a4V = vld1q_f32(pIn1E);
a5V = vld1q_f32(pIn1F);
a6V = vld1q_f32(pIn1G);
a7V = vld1q_f32(pIn1H);
pIn1 += 4;
pIn1B += 4;
pIn1C += 4;
pIn1D += 4;
pIn1E += 4;
pIn1F += 4;
pIn1G += 4;
pIn1H += 4;
temp = vsetq_lane_f32(*pIn2,temp,0);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,1);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,2);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,3);
pIn2 += numColsB;
acc0 = vmlaq_f32(acc0,a0V,temp);
acc1 = vmlaq_f32(acc1,a1V,temp);
acc2 = vmlaq_f32(acc2,a2V,temp);
acc3 = vmlaq_f32(acc3,a3V,temp);
acc4 = vmlaq_f32(acc4,a4V,temp);
acc5 = vmlaq_f32(acc5,a5V,temp);
acc6 = vmlaq_f32(acc6,a6V,temp);
acc7 = vmlaq_f32(acc7,a7V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA & 3;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum0 += *pIn1++ * (*pIn2);
sum1 += *pIn1B++ * (*pIn2);
sum2 += *pIn1C++ * (*pIn2);
sum3 += *pIn1D++ * (*pIn2);
sum4 += *pIn1E++ * (*pIn2);
sum5 += *pIn1F++ * (*pIn2);
sum6 += *pIn1G++ * (*pIn2);
sum7 += *pIn1H++ * (*pIn2);
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = sum0;
*pxB++ = sum1;
*pxC++ = sum2;
*pxD++ = sum3;
*pxE++ = sum4;
*pxF++ = sum5;
*pxG++ = sum6;
*pxH++ = sum7;
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = pSrcB->pData + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + GROUPOFROWS*numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/*
i was the index of a group of rows computed by previous loop.
Now i is the index of a row since below code is computing row per row
and no more group of row per group of rows.
*/
i = GROUPOFROWS*i;
rowCnt = row & 7;
while(rowCnt > 0)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
@@ -117,7 +771,7 @@ arm_status arm_mat_mult_f32(
j = 0U;
/* column loop */
/* Column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
@@ -126,43 +780,43 @@ arm_status arm_mat_mult_f32(
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
acc0 = vdupq_n_f32(0.0);
/* Compute 4 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
/* Matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
in3 = *pIn2;
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
pIn1 += 4;
temp = vsetq_lane_f32(*pIn2,temp,0);
pIn2 += numColsB;
in1 = pIn1[0];
in2 = pIn1[1];
sum += in1 * in3;
in4 = *pIn2;
temp = vsetq_lane_f32(*pIn2,temp,1);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,2);
pIn2 += numColsB;
temp = vsetq_lane_f32(*pIn2,temp,3);
pIn2 += numColsB;
sum += in2 * in4;
in3 = *pIn2;
pIn2 += numColsB;
in1 = pIn1[2];
in2 = pIn1[3];
sum += in1 * in3;
in4 = *pIn2;
pIn2 += numColsB;
sum += in2 * in4;
pIn1 += 4U;
acc0 = vmlaq_f32(acc0,a0V,temp);
/* Decrement the loop count */
colCnt--;
}
accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
/* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
pIn2 += numColsB;
@@ -182,40 +836,75 @@ arm_status arm_mat_mult_f32(
} while (col > 0U);
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
rowCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/* Run the below code for Cortex-M0 */
float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
float32_t *pOut = pDst->pData; /* Output data matrix pointer */
float32_t *px; /* Temporary output data matrix pointer */
float32_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pInA with each column in pInB */
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of the row being processed */
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@@ -224,43 +913,78 @@ arm_status arm_mat_mult_f32(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0.0f;
/* Initialize the pointer pIn1 to point to the starting address of the row being processed */
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
sum += *pIn1++ * (*pIn2);
/* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store the result in the destination buffer */
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize cntCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
/* Perform the multiply-accumulates */
sum += *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Store result in destination buffer */
*px++ = sum;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* Update the pointer pInA to point to the starting address of the next row */
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -269,6 +993,9 @@ arm_status arm_mat_mult_f32(
return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixMult group
*/

View File

@@ -0,0 +1,202 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_f64.c
* Description: Floating-point matrix multiplication
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
*/
/**
* @defgroup MatrixMult Matrix Multiplication
*
* Multiplies two matrices.
*
* \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
* Matrix multiplication is only defined if the number of columns of the
* first matrix equals the number of rows of the second matrix.
* Multiplying an <code>M x N</code> matrix with an <code>N x P</code> matrix results
* in an <code>M x P</code> matrix.
* When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
* <code>pSrcA</code> and <code>pSrcB</code> are equal; and (2) that the size of the output
* matrix equals the outer dimensions of <code>pSrcA</code> and <code>pSrcB</code>.
*/
/**
* @addtogroup MatrixMult
* @{
*/
/**
* @brief Floating-point matrix multiplication.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*/
arm_status arm_mat_mult_f64(
  const arm_matrix_instance_f64 * pSrcA,
  const arm_matrix_instance_f64 * pSrcB,
  arm_matrix_instance_f64 * pDst)
{
  float64_t *pRowA  = pSrcA->pData;              /* Cursor along the current row of A */
  float64_t *pColB  = pSrcB->pData;              /* Cursor down the current column of B */
  float64_t *pABase = pSrcA->pData;              /* Start of the row of A being processed */
  float64_t *pBBase = pSrcB->pData;              /* Base address of matrix B data */
  float64_t *pOut   = pDst->pData;               /* Base address of the output matrix */
  float64_t *pWrite;                             /* Write cursor into the output matrix */
  float64_t acc;                                 /* Dot-product accumulator */
  uint16_t numRowsA = pSrcA->numRows;            /* Rows in matrix A */
  uint16_t numColsB = pSrcB->numCols;            /* Columns in matrix B */
  uint16_t numColsA = pSrcA->numCols;            /* Columns in matrix A (== rows in B) */
  uint64_t colsLeft, outOffset = 0U, rowsLeft = numRowsA, macCnt; /* Loop counters */
  arm_status status;                             /* Result of the size check / operation */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Dimensions must satisfy (M x N) * (N x P) = (M x P) */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols)    )
  {
    /* Report incompatible matrix sizes */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Each output element is the dot product of one row of A with one column of B. */

    /* row loop */
    do
    {
      /* Position the write cursor at the start of this output row */
      pWrite = pOut + outOffset;

      /* Restart the column counter and B cursor for this row */
      colsLeft = numColsB;
      pColB = pSrcB->pData;

      /* column loop */
      do
      {
        /* Clear the accumulator for this output element */
        acc = 0.0;

        /* Rewind the A cursor to the start of the current row */
        pRowA = pABase;

#if defined (ARM_MATH_LOOPUNROLL)

        /* Loop unrolling: 4 multiply-accumulates per iteration */
        macCnt = numColsA >> 2U;

        while (macCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          acc += *pRowA++ * *pColB; pColB += numColsB;
          acc += *pRowA++ * *pColB; pColB += numColsB;
          acc += *pRowA++ * *pColB; pColB += numColsB;
          acc += *pRowA++ * *pColB; pColB += numColsB;

          macCnt--;
        }

        /* Handle the 0..3 leftover multiply-accumulates */
        macCnt = numColsA % 0x4U;

#else

        /* No unrolling: one multiply-accumulate per column of A */
        macCnt = numColsA;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (macCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          acc += *pRowA++ * *pColB;
          pColB += numColsB;

          macCnt--;
        }

        /* Emit the finished dot product */
        *pWrite++ = acc;

        colsLeft--;

        /* Point the B cursor at the top of the next column */
        pColB = pBBase + (numColsB - colsLeft);

      } while (colsLeft > 0U);

      /* Advance to the next output row and the next row of A */
      outOffset = outOffset + numColsB;
      pABase = pABase + numColsA;

      rowsLeft--;

    } while (rowsLeft > 0U);

    /* All elements computed successfully */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
* @} end of MatrixMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q15.c
* Description: Q15 matrix multiplication (fast variant)
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,209 +26,168 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The difference between the function arm_mat_mult_q15() and this fast variant is that
* the fast variant use a 32-bit rather than a 64-bit accumulator.
* The result of each 1.15 x 1.15 multiplication is truncated to
* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
* format. Finally, the accumulator is saturated and converted to a 1.15 result.
*
* \par
* The fast version has the same overflow behavior as the standard version but provides
* less precision since it discards the low 16 bits of each multiplication result.
* In order to avoid overflows completely the input signals must be scaled down.
* Scale down one of the input matrices by log2(numColsA) bits to
* avoid overflows, as a total of numColsA additions are computed internally for each
* output element.
*
* \par
* See <code>arm_mat_mult_q15()</code> for a slower implementation of this function
* which uses 64-bit accumulation to provide higher precision.
@brief Q15 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.15 x 1.15 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.15 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 16 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q31_t sum; /* accumulator */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inA2, inB1, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
q31_t sum; /* Accumulator */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#if defined (ARM_MATH_DSP)
q31_t in; /* Temporary variable to hold the input value */
q31_t inA1, inB1, inA2, inB2;
q31_t sum2, sum3, sum4;
q15_t *pInA2, *pInB2, *px2;
uint32_t j = 0;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inA2, inB1, inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
/* The pointer px is set to starting address of column being processed */
px = pSrcBT + i;
/* Apply loop unrolling and exchange columns with row elements */
col = numColsB >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
#if defined (ARM_MATH_DSP)
/* Read two elements from row */
in = read_q15x2_ia (&pInB);
/* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
/* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
in = read_q15x2_ia (&pInB);
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
#else
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
px += numRowsB;
/* Read one element from the row */
#else /* #if defined (ARM_MATH_DSP) */
/* Read one element from row */
in = *pInB++;
/* Store one element in the destination */
/* Store one element in destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
in = *pInB++;
*px = in;
px += numRowsB;
#endif /* #if defined (ARM_MATH_DSP) */
/* Decrement column loop counter */
col--;
}
@@ -238,31 +197,31 @@ arm_status arm_mat_mult_fast_q15(
while (col > 0U)
{
/* Read and store the input element in the destination */
/* Read and store input element in destination */
*px = *pInB++;
/* Update the pointer px to point to the next row of the transposed matrix */
/* Update pointer px to point to next row of transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Process two rows from matrix A at a time and output two rows at a time */
row = row >> 1;
row = row >> 1U;
px2 = px + numColsB;
#endif
@@ -270,29 +229,28 @@ arm_status arm_mat_mult_fast_q15(
/* row loop */
while (row > 0U)
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Process two (transposed) columns from matrix B at a time */
col = col >> 1;
col = col >> 1U;
j = 0;
#endif
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pInA to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
sum2 = 0;
sum3 = 0;
sum4 = 0;
@@ -300,57 +258,56 @@ arm_status arm_mat_mult_fast_q15(
pInA2 = pInA + numColsA;
pInB2 = pInB + numRowsB;
/* Read in two elements at once - alows dual MAC instruction */
colCnt = numColsA >> 1;
/* Read in two elements at once - allows dual MAC instruction */
colCnt = numColsA >> 1U;
#else
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
#ifndef UNALIGNED_SUPPORT_DISABLE
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
inA1 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inA2 = *__SIMD32(pInA2)++;
inB2 = *__SIMD32(pInB2)++;
#if defined (ARM_MATH_DSP)
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inA2 = read_q15x2_ia (&pInA2);
inB2 = read_q15x2_ia (&pInB2);
/* Multiply and Accumulates */
sum = __SMLAD(inA1, inB1, sum);
sum2 = __SMLAD(inA1, inB2, sum2);
sum3 = __SMLAD(inA2, inB1, sum3);
sum4 = __SMLAD(inA2, inB2, sum4);
#else
inA1 = *pInA;
inB1 = *pInB;
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumulates */
sum += inA1 * inB1;
inA2 = pInA[1];
inB2 = pInB[1];
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
inA1 = pInA[2];
inB1 = pInB[2];
inA1 = *pInA++;
inB1 = *pInB++;
sum += inA1 * inB1;
inA2 = pInA[3];
inB2 = pInB[3];
inA2 = *pInA++;
inB2 = *pInB++;
sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */
pInA += 4;
pInB += 4;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process odd column samples */
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
if (numColsA & 1U) {
inA1 = *pInA++;
inB1 = *pInB++;
@@ -366,44 +323,45 @@ arm_status arm_mat_mult_fast_q15(
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
sum += (q31_t) (*pInA++) * (*pInB++);
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += (q31_t) *pInA++ * *pInB++;
/* Decrement loop counter */
colCnt--;
}
#endif
#endif /* #if defined (ARM_MATH_DSP) */
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
*px++ = (q15_t) (sum2 >> 15);
*px2++ = (q15_t) (sum3 >> 15);
*px2++ = (q15_t) (sum4 >> 15);
j += numRowsB * 2;
#endif
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#ifndef UNALIGNED_SUPPORT_DISABLE
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
@@ -412,7 +370,7 @@ arm_status arm_mat_mult_fast_q15(
row = numRowsA & (~0x1);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@@ -420,26 +378,26 @@ arm_status arm_mat_mult_fast_q15(
{
/* point to last column in matrix B */
pInB = pSrcBT + numRowsB*(numColsB-1);
pInB = pSrcBT + numRowsB * (numColsB-1);
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia (&pInA);
inA2 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inB2 = read_q15x2_ia (&pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
@@ -449,11 +407,11 @@ arm_status arm_mat_mult_fast_q15(
colCnt--;
}
/* Store the result in the destination buffer */
*px = (q15_t) (sum >> 15);
/* Store result in destination buffer */
*px = (q15_t) (sum >> 15);
px += numColsB;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
}
@@ -462,7 +420,7 @@ arm_status arm_mat_mult_fast_q15(
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
px = pDst->pData + (numColsB) * (numRowsA-1);
pInB = pSrcBT;
col = numColsB;
@@ -471,48 +429,48 @@ arm_status arm_mat_mult_fast_q15(
/* col loop */
while (col > 0)
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *__SIMD32(pInA)++;
inA2 = *__SIMD32(pInA)++;
inB1 = *__SIMD32(pInB)++;
inB2 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia (&pInA);
inA2 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
inB2 = read_q15x2_ia (&pInB);
sum = __SMLAD(inA1, inB1, sum);
sum = __SMLAD(inA2, inB2, sum);
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
colCnt = numColsA % 4U;
while (colCnt > 0U) {
sum += (q31_t) (*pInA++) * (*pInB++);
colCnt--;
}
/* Store the result in the destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Store result in destination buffer */
*px++ = (q15_t) (sum >> 15);
/* Decrement the col loop counter */
/* Decrement column loop counter */
col--;
}
}
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -521,5 +479,5 @@ arm_status arm_mat_mult_fast_q15(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_mult_fast_q31.c
* Description: Q31 matrix multiplication (fast variant)
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,229 +26,169 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The difference between the function arm_mat_mult_q31() and this fast variant is that
* the fast variant use a 32-bit rather than a 64-bit accumulator.
* The result of each 1.31 x 1.31 multiplication is truncated to
* 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
* format. Finally, the accumulator is saturated and converted to a 1.31 result.
*
* \par
* The fast version has the same overflow behavior as the standard version but provides
* less precision since it discards the low 32 bits of each multiplication result.
* In order to avoid overflows completely the input signals must be scaled down.
* Scale down one of the input matrices by log2(numColsA) bits to
* avoid overflows, as a total of numColsA additions are computed internally for each
* output element.
*
* \par
* See <code>arm_mat_mult_q31()</code> for a slower implementation of this function
* which uses 64-bit accumulation to provide higher precision.
@brief Q31 matrix multiplication (fast variant).
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
the fast variant use a 32-bit rather than a 64-bit accumulator.
The result of each 1.31 x 1.31 multiplication is truncated to
2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
format. Finally, the accumulator is saturated and converted to a 1.31 result.
@par
The fast version has the same overflow behavior as the standard version but provides
less precision since it discards the low 32 bits of each multiplication result.
In order to avoid overflows completely the input signals must be scaled down.
Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
as a total of numColsA additions are computed internally for each output element.
@remark
Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
which uses 64-bit accumulation to provide higher precision.
*/
arm_status arm_mat_mult_fast_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *px; /* Temporary output data matrix pointer */
q31_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q31_t inA1, inB1;
#if defined (ARM_MATH_DSP)
q31_t sum2, sum3, sum4;
q31_t inA2, inB2;
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA2;
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2;
q31_t sum1, sum2, sum3, sum4; /* Accumulator */
q31_t inA1, inA2, inB1, inB2;
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
px = pDst->pData;
#if defined (ARM_MATH_DSP)
row = row >> 1;
row = row >> 1U;
px2 = px + numColsB;
#endif
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
while (row > 0U)
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pInB = pSrcB->pData;
j = 0U;
#if defined (ARM_MATH_DSP)
col = col >> 1;
#endif
col = col >> 1U;
/* column loop */
while (col > 0U)
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
#if defined (ARM_MATH_DSP)
sum1 = 0;
sum2 = 0;
sum3 = 0;
sum4 = 0;
/* Initiate data pointers */
pInA = pSrcA->pData + i;
pInB = pSrcB->pData + j;
pInA2 = pInA + numColsA;
colCnt = numColsA;
#else
colCnt = numColsA >> 2;
#endif
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
#if defined (ARM_MATH_DSP)
inA1 = *pInA++;
inB1 = pInB[0];
inA2 = *pInA2++;
inB2 = pInB[1];
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum2 = __SMMLA(inA1, inB2, sum2);
sum3 = __SMMLA(inA2, inB1, sum3);
sum4 = __SMMLA(inA2, inB2, sum4);
#else
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[0];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[1];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[2];
sum = __SMMLA(inA1, inB1, sum);
inB1 = *pInB;
pInB += numColsB;
inA1 = pInA[3];
sum = __SMMLA(inA1, inB1, sum);
pInA += 4U;
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
#ifdef ARM_MATH_CM0_FAMILY
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
sum = __SMMLA(*pInA++, *pInB, sum);
pInB += numColsB;
colCnt--;
}
j++;
#endif
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px++ = sum << 1;
#if defined (ARM_MATH_DSP)
*px++ = sum1 << 1;
*px++ = sum2 << 1;
*px2++ = sum3 << 1;
*px2++ = sum4 << 1;
j += 2;
#endif
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
}
i = i + numColsA;
i = i + (numColsA << 1U);
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#if defined (ARM_MATH_DSP)
i = i + numColsA;
px = px2 + (numColsB & 1U);
px2 = px + numColsB;
#endif
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
/* Compute any remaining odd row/column below */
#if defined (ARM_MATH_DSP)
/* Compute remaining output column */
if (numColsB & 1U) {
/* Avoid redundant computation of last element */
row = numRowsA & (~0x1);
row = numRowsA & (~1U);
/* Point to remaining unfilled column in output matrix */
px = pDst->pData+numColsB-1;
px = pDst->pData + numColsB-1;
pInA = pSrcA->pData;
/* row loop */
@@ -258,49 +198,75 @@ arm_status arm_mat_mult_fast_q31(
/* point to last column in matrix B */
pInB = pSrcB->pData + numColsB-1;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
inA1 = *pInA++;
inA2 = *pInA++;
inB1 = *pInB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
/* Decrement the loop counter */
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
sum = __SMMLA(*pInA++, *pInB, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Convert the result from 2.30 to 1.31 format and store in destination buffer */
*px = sum << 1;
*px = sum1 << 1;
px += numColsB;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
}
}
@@ -309,7 +275,7 @@ arm_status arm_mat_mult_fast_q31(
if (numRowsA & 1U) {
/* point to last row in output matrix */
px = pDst->pData+(numColsB)*(numRowsA-1);
px = pDst->pData + (numColsB) * (numRowsA-1);
col = numColsB;
i = 0U;
@@ -319,14 +285,16 @@ arm_status arm_mat_mult_fast_q31(
{
/* point to last row in matrix A */
pInA = pSrcA->pData + (numRowsA-1)*numColsA;
pInA = pSrcA->pData + (numRowsA-1) * numColsA;
pInB = pSrcB->pData + i;
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Set variable sum1, that acts as accumulator, to zero */
sum1 = 0;
/* Compute 4 columns at once */
colCnt = numColsA >> 2;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 columns at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
@@ -337,8 +305,13 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
inA1 = *pInA++;
inA2 = *pInA++;
@@ -346,32 +319,49 @@ arm_status arm_mat_mult_fast_q31(
pInB += numColsB;
inB2 = *pInB;
pInB += numColsB;
sum = __SMMLA(inA1, inB1, sum);
sum = __SMMLA(inA2, inB2, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(inA1, inB1, sum1);
sum1 = __SMMLA(inA2, inB2, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
#endif
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
colCnt = numColsA & 3U;
/* Loop unrolling: Compute remaining column */
colCnt = numColsA % 4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U) {
sum = __SMMLA(*pInA++, *pInB, sum);
#if defined (ARM_MATH_DSP)
sum1 = __SMMLA(*pInA++, *pInB, sum1);
#else
sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
#endif
pInB += numColsB;
colCnt--;
}
/* Saturate and store the result in the destination buffer */
*px++ = sum << 1;
*px++ = sum1 << 1;
i++;
/* Decrement the col loop counter */
/* Decrement col loop counter */
col--;
}
}
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -380,5 +370,5 @@ arm_status arm_mat_mult_fast_q31(
}
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,784 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_opt_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 3 Nov 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
@remark
This function is a faster implementation of arm_mat_mult_q31 for MVE but it is requiring
additional storage for intermediate results.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/* 2x2 Q31 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 2x2 row-major input matrices
 * pDst         : 2x2 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * Each B column is fetched with a gather load (stride MATRIX_DIM2); both
 * A rows are loaded once and reused for the two output columns.
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA; /* row 0 of A */
q31_t *pInA1 = pInA0 + MATRIX_DIM2; /* row 1 of A */
q63_t acc0, acc1;
q31x4_t vecB, vecA0, vecA1;
/* enable predication to disable half of vector elements */
mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
/* gather offsets 0, 2 (, 4, 6 masked off): one column of row-major B */
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM2;
pInB = pSrcB->pData;
/* load 1st B column (partial load) */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
/* load A rows */
vecA0 = vldrwq_s32(pInA0);
vecA1 = vldrwq_s32(pInA1);
/* dot products of each A row with the B column */
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
/* rescale accumulators to 1.31 output format */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
/* write column 0 of the output (row stride = MATRIX_DIM2) */
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
/* write column 1 of the output */
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 3x3 Q31 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 3x3 row-major input matrices
 * pDst         : 3x3 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * Processes one B column per section: gather-load the column (stride
 * MATRIX_DIM3, 4th lane predicated off), dot it with each A row, rescale
 * to 1.31 and store down the matching output column.
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA; /* row 0 of A */
q31_t *pInA1 = pInA0 + MATRIX_DIM3; /* row 1 of A */
q31_t *pInA2 = pInA1 + MATRIX_DIM3; /* row 2 of A */
q63_t acc0, acc1, acc2;
q31x4_t vecB, vecA;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
/* gather offsets 0, 3, 6: one column of row-major B */
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
/* column 0 */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
/* rescale accumulators to 1.31 output format */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 1 */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 2 */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 4x4 Q31 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 4x4 row-major input matrices
 * pDst         : 4x4 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * For each of the four output columns: gather one full B column
 * (offsets 0,4,8,12 — no predication needed, a column fills the vector),
 * form the dot product with every A row, rescale each accumulator to
 * 1.31 and store the four results down the matching output column
 * (row stride = MATRIX_DIM4).
 */
__STATIC_INLINE arm_status arm_mat_mult_opt_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
    q31_t *colB = pSrcB->pData;       /* walks the columns of B         */
    q31_t *colOut = pDst->pData;      /* walks the columns of the output */
    q31_t *rowA0 = pSrcA->pData;      /* the four rows of A             */
    q31_t *rowA1 = rowA0 + MATRIX_DIM4;
    q31_t *rowA2 = rowA1 + MATRIX_DIM4;
    q31_t *rowA3 = rowA2 + MATRIX_DIM4;
    q63_t acc0, acc1, acc2, acc3;
    q31x4_t vecB, vecA;
    uint32x4_t vecColBOffs;
    int c;

    /* gather offsets 0, 4, 8, 12: one column of row-major 4x4 B */
    vecColBOffs = vidupq_u32((uint32_t)0, 4);

    for (c = 0; c < MATRIX_DIM4; c++)
    {
        /* gather the current B column */
        vecB = vldrwq_gather_shifted_offset_s32(colB, vecColBOffs);

        /* dot product of every A row with this column */
        vecA = vldrwq_s32(rowA0);
        acc0 = vrmlaldavhq(vecA, vecB);
        vecA = vldrwq_s32(rowA1);
        acc1 = vrmlaldavhq(vecA, vecB);
        vecA = vldrwq_s32(rowA2);
        acc2 = vrmlaldavhq(vecA, vecB);
        vecA = vldrwq_s32(rowA3);
        acc3 = vrmlaldavhq(vecA, vecB);

        /* rescale accumulators to 1.31 output format */
        acc0 = asrl(acc0, 23);
        acc1 = asrl(acc1, 23);
        acc2 = asrl(acc2, 23);
        acc3 = asrl(acc3, 23);

        /* store down the matching output column */
        colOut[0 * MATRIX_DIM4] = (q31_t) acc0;
        colOut[1 * MATRIX_DIM4] = (q31_t) acc1;
        colOut[2 * MATRIX_DIM4] = (q31_t) acc2;
        colOut[3 * MATRIX_DIM4] = (q31_t) acc3;

        /* advance to the next B / output column */
        colB++;
        colOut++;
    }

    /*
     * Return to application
     */
    return (ARM_MATH_SUCCESS);
}
/* Q31 matrix multiplication, Helium (MVE) implementation.
 *
 * pSrcA, pSrcB : input matrices (row-major)
 * pDst         : output matrix (numRowsA x numColsB)
 * pState       : scratch buffer used to hold the transpose of B
 *                (must hold numRowsB * numColsB elements — TODO confirm
 *                against the public API documentation)
 * Returns ARM_MATH_SUCCESS, or ARM_MATH_SIZE_MISMATCH when
 * ARM_MATH_MATRIX_CHECK is enabled and the dimensions are inconsistent.
 *
 * Strategy: B is transposed into pState so that both operands of every
 * dot product are contiguous rows; output is then produced as 2x2 blocks
 * (two A rows against two B' rows), with scalar-column / scalar-row
 * clean-up passes when numColsB or numRowsA is odd.
 */
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState)
{
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA2; /* second (next) A row pointer */
q31_t *pInB2; /* second (next) B' row pointer */
q31_t *px; /* Temporary output data matrix pointer */
q31_t *px2; /* Temporary output data matrix pointer (next output row) */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q31_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q31 BT; /* transposed view of B, backed by pState */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols)) {
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
} else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
/* 1x1 case: plain widening multiply, 2.62 -> 1.31 */
q63_t sum = (q63_t) *pInA * *pInB;
pDst->pData[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_opt_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_opt_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_opt_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
 * Matrix transpose: B' is stored in the caller-provided scratch buffer
 */
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q31(pSrcB, &BT);
/*
 * Reset the variables for the usage in the following multiplication process
 */
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
 * main loop
 * compute 2 x 2 output blocks
 * with dot products (Matrix A rows * Transposed Matrix B rows)
 */
while (row > 0u) {
/*
 * For every row wise process, the column loop counter is to be initiated
 * Compute 2 columns and 2 rows in parallel
 */
col = numColsB >> 1;
j = 0;
/*
 * column pair loop
 */
while (col > 0u) {
q31_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q31x4_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
 * Initiate the pointers
 * - 2 x consecutive Matrix A rows (i increment is 2 x numColsA)
 * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
 */
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q31_t const *) pInA;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcBVec = (q31_t const *) pInB;
pSrcB2Vec = (q31_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
/* load scheduling: first A vector pre-loaded so loads and MACs interleave */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB2);
/* pre-load next A vector for the following iteration */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
acc3 = vrmlaldavhaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
/* predicate off lanes beyond the remaining columns */
mve_pred16_t p0 = vctp32q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vrmlaldavhaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vrmlaldavhaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vrmlaldavhaq_p(acc3, vecA2, vecB2, p0);
}
/* Convert to 1.31 */
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
/* Store the results (2 x 2 block) in the destination buffer */
*px++ = (q31_t) acc0;
*px++ = (q31_t) acc2;
*px2++ = (q31_t) acc1;
*px2++ = (q31_t) acc3;
j += numRowsB * 2;
/*
 * Decrement the column pair loop counter
 */
col--;
}
/* advance to the next pair of A rows / output rows; when numColsB is odd
   the last output column is left for the clean-up pass below */
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
 * Decrement the row pair loop counter
 */
row--;
}
/*
 * Compute remaining row and/or column below
 */
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
 * row loop
 */
while (row > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
 * point to last column in matrix B
 */
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
/* single dot-product */
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
/* Convert to 1.31 and store into the last output column */
acc0 = asrl(acc0, 23);
*px = (q31_t) acc0;
px += numColsB;
i += numColsA;
/*
 * Decrement the row loop counter
 */
row--;
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
/*
 * point to last row in output matrix
 */
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
 * col loop
 */
while (col > 0) {
q31_t const *pSrcAVec, *pSrcBVec;
q31x4_t vecA, vecB;
q63_t acc0;
/*
 * point to last row in matrix A
 */
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
 * Set the variable sum, that acts as accumulator, to zero
 */
pSrcAVec = (q31_t const *) pInA;
pSrcBVec = (q31_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA / 4);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
 * tail
 * (will be merged thru tail predication)
 */
blkCnt = (numColsA & 3);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vrmlaldavhaq_p(acc0, vecA, vecB, p0);
}
/* Convert to 1.31 and store into the last output row */
acc0 = asrl(acc0, 23);
*px++ = (q31_t) acc0;
i += numColsA;
/*
 * Decrement the col loop counter
 */
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/*
 * Return to application
 */
return (status);
}
#else
/* Q31 matrix multiplication, scalar (non-MVE) fallback.
 *
 * pSrcA, pSrcB : input matrices (row-major)
 * pDst         : output matrix (numRowsA x numColsB)
 * pState       : unused in this variant (kept for API compatibility
 *                with the MVE implementation)
 * Returns ARM_MATH_SUCCESS, or ARM_MATH_SIZE_MISMATCH when
 * ARM_MATH_MATRIX_CHECK is enabled and the dimensions are inconsistent.
 *
 * Classic triple loop: for each output element, accumulate the row/column
 * dot product in a 64-bit 2.62 accumulator, then shift down to 1.31.
 * NOTE(review): the do/while structure assumes numRowsA and numColsB are
 * non-zero — zero-sized matrices would underflow the unsigned counters.
 */
arm_status arm_mat_mult_opt_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst,
q31_t *pState)
{
q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState; /* scratch buffer not needed without MVE */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
{
/* Output pointer is set to starting address of row being processed */
px = pOut + i;
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initialize pointer pIn1 to point to starting address of column being processed */
pIn1 = pInA;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 MACs at a time. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Loop unrolling: Compute remaining MACs */
colCnt = numColsA % 0x4U;
#else
/* Initialize colCnt with number of columns */
colCnt = numColsA;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) *pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement loop counter */
colCnt--;
}
/* Convert result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Decrement column loop counter */
col--;
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update pointer pInA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement row loop counter */
row--;
} while (row > 0U);
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_mult_q15.c
* Description: Q15 matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 3 Nov 2021
* $Revision: V1.10.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,236 +26,643 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q15 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results (Unused)
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 64-bit internal accumulator. The inputs to the
* multiplications are in 1.15 format and multiplications yield a 2.30 result.
* The 2.30 intermediate
* results are accumulated in a 64-bit accumulator in 34.30 format. This approach
* provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
* truncated to 34.15 format by discarding the low 15 bits and then saturated to
* 1.15 format.
*
* \par
* Refer to <code>arm_mat_mult_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
*
*/
@brief Q15 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@param[in] pState points to the array for storing intermediate results
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator. The inputs to the
multiplications are in 1.15 format and multiplications yield a 2.30 result.
The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
This approach provides 33 guard bits and there is no risk of overflow.
The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
and then saturated to 1.15 format.
@par
Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
/* 2x2 Q15 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 2x2 row-major input matrices
 * pDst         : 2x2 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * Each B column is gathered with stride-2 offsets (lanes beyond the 2nd
 * predicated off); the two A rows are loaded once and reused for both
 * output columns. Results are shifted to 1.15 and saturated.
 */
__STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA; /* row 0 of A */
q15_t *pInA1 = pInA0 + MATRIX_DIM2; /* row 1 of A */
q63_t acc0, acc1;
q15x8_t vecB, vecA0, vecA1;
/* enable only the first MATRIX_DIM2 lanes */
mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
pInB = pSrcB->pData;
/* column 0: gather, dot with both A rows */
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
/* 2.30 accumulator -> 1.15 with saturation */
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 3x3 Q15 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 3x3 row-major input matrices
 * pDst         : 3x3 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * One section per B column: gather the column (stride MATRIX_DIM3, lanes
 * beyond the 3rd predicated off), dot it with each A row, shift to 1.15,
 * saturate, and store down the matching output column.
 */
__STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA; /* row 0 of A */
q15_t *pInA1 = pInA0 + MATRIX_DIM3; /* row 1 of A */
q15_t *pInA2 = pInA1 + MATRIX_DIM3; /* row 2 of A */
q63_t acc0, acc1, acc2;
q15x8_t vecB, vecA0, vecA1, vecA2;
/* enable only the first MATRIX_DIM3 lanes */
mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
/* gather offsets 0, 3, 6: one column of row-major B */
vecColBOffs = vidupq_u16((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
/* column 0 */
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
/* 2.30 accumulator -> 1.15 with saturation */
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 1 */
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 2 */
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
/* 4x4 Q15 matrix multiplication specialization for Helium (MVE).
 *
 * pSrcA, pSrcB : 4x4 row-major input matrices
 * pDst         : 4x4 row-major output matrix
 * Returns ARM_MATH_SUCCESS unconditionally.
 *
 * One section per B column: gather the column (offsets 0,4,8,12, lanes
 * beyond the 4th predicated off), dot it with each A row, shift to 1.15,
 * saturate, and store down the matching output column.
 */
__STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint16x8_t vecColBOffs;
q15_t *pInA0 = pInA; /* row 0 of A */
q15_t *pInA1 = pInA0 + MATRIX_DIM4; /* row 1 of A */
q15_t *pInA2 = pInA1 + MATRIX_DIM4; /* row 2 of A */
q15_t *pInA3 = pInA2 + MATRIX_DIM4; /* row 3 of A */
q63_t acc0, acc1, acc2, acc3;
q15x8_t vecB, vecA0, vecA1, vecA2, vecA3;
/* enable only the first MATRIX_DIM4 lanes */
mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
/* gather offsets 0, 4, 8, 12: one column of row-major B */
vecColBOffs = vidupq_u16((uint32_t)0, 4);
pInB = pSrcB->pData;
/* column 0 */
vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
vecA0 = vldrhq_s16(pInA0);
vecA1 = vldrhq_s16(pInA1);
vecA2 = vldrhq_s16(pInA2);
vecA3 = vldrhq_s16(pInA3);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
/* 2.30 accumulator -> 1.15 with saturation */
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 1 */
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 2 */
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
pOut++;
/* move to next B column */
pInB = pInB + 1;
/* column 3 */
vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
acc0 = vmlaldavq(vecA0, vecB);
acc1 = vmlaldavq(vecA1, vecB);
acc2 = vmlaldavq(vecA2, vecB);
acc3 = vmlaldavq(vecA3, vecB);
acc0 = asrl(acc0, 15);
acc1 = asrl(acc1, 15);
acc2 = asrl(acc2, 15);
acc3 = asrl(acc3, 15);
pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
/*
 * Return to application
 */
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA2;
q15_t *pInB2;
q15_t *px; /* Temporary output data matrix pointer */
q15_t *px2; /* Temporary output data matrix pointer */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
uint32_t blkCnt; /* loop counters */
arm_status status; /* Status of matrix multiplication */
arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif
{
/* small squared matrix specialized routines */
if (numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1) {
q63_t sum;
sum = pInA[0] * pInB[0];
pDst->pData[0] = (q15_t) __SSAT((sum >> 15), 16);
return (ARM_MATH_SUCCESS);
} else if (numRowsA == 2)
return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 3)
return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q15(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1;
px = pDst->pData;
px2 = px + numColsB;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
while (row > 0u) {
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB >> 1;
/*
* For every row wise process, the pIn2 pointer is set
* to the starting address of the transposed pSrcB data
*/
pInB = pSrcBT;
pInB2 = pInB + numRowsB;
j = 0;
/*
* column loop
*/
while (col > 0u) {
q15_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q15x8_t vecA, vecA2, vecB, vecB2;
q63_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q15_t const *) pInA;
pSrcA2Vec = (q15_t const *) pInA2;
pSrcBVec = (q15_t const *) pInB;
pSrcB2Vec = (q15_t const *) pInB2;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
blkCnt = numColsA / 8;
while (blkCnt > 0U) {
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 8;
acc1 = vmlaldavaq(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 8;
acc2 = vmlaldavaq(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
acc3 = vmlaldavaq(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail
*/
blkCnt = numColsA & 7;
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vmlaldavaq_p(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vmlaldavaq_p(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vmlaldavaq_p(acc3, vecA2, vecB2, p0);
}
*px++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
*px++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
*px2++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
*px2++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
j += numRowsB * 2;
/*
* Decrement the column loop counter
*/
col--;
}
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u) {
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0) {
q15_t const *pSrcAVec, *pSrcBVec;
q15x8_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q15_t const *) pInA;
pSrcBVec = (q15_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) / 8;
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA & 7);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
}
*px = (q15_t) MVE_ASRL_SAT16(acc0, 15);
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u) {
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0) {
q15_t const *pSrcAVec, *pSrcBVec;
q15x8_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Set the variable sum, that acts as accumulator, to zero
*/
pSrcAVec = (q15_t const *) pInA;
pSrcBVec = (q15_t const *) pInB;
acc0 = 0LL;
blkCnt = ((numColsA) / 8);
while (blkCnt > 0U) {
vecA = vld1q(pSrcAVec);
pSrcAVec += 8;
vecB = vld1q(pSrcBVec);
pSrcBVec += 8;
acc0 = vmlaldavaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
*/
blkCnt = (numColsA & 7);
if (blkCnt > 0U) {
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
}
*px++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_mult_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst,
q15_t * pState)
arm_matrix_instance_q15 * pDst,
q15_t * pState)
{
q63_t sum; /* accumulator */
q63_t sum; /* Accumulator */
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_DSP) /* != CM0 */
/* Run the below code for Cortex-M4 and Cortex-M3 */
q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsB, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifndef UNALIGNED_SUPPORT_DISABLE
q31_t in; /* Temporary variable to hold the input value */
q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
#else
q15_t in; /* Temporary variable to hold the input value */
q15_t inA1, inB1, inA2, inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
q31_t inA1, inB1, inA2, inB2;
arm_matrix_instance_q15 BT;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Matrix transpose */
do
{
/* Apply loop unrolling and exchange the columns with row elements */
col = numColsB >> 2;
/* The pointer px is set to starting address of the column being processed */
px = pSrcBT + i;
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (col > 0U)
{
#ifndef UNALIGNED_SUPPORT_DISABLE
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read two elements from the row */
in = *__SIMD32(pInB)++;
/* Unpack and store one element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) in;
#else
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Unpack and store the second element in the destination */
#ifndef ARM_MATH_BIG_ENDIAN
*px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
*px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
#else
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Read one element from the row */
in = *pInB++;
/* Store one element in the destination */
*px = in;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the column loop counter */
col--;
}
/* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
col = numColsB % 0x4U;
while (col > 0U)
{
/* Read and store the input element in the destination */
*px = *pInB++;
/* Update the pointer px to point to the next row of the transposed matrix */
px += numRowsB;
/* Decrement the column loop counter */
col--;
}
i++;
/* Decrement the row loop counter */
row--;
} while (row > 0U);
/* Reset the variables for the usage in the following multiplication process */
arm_mat_trans_q15(pSrcB,&BT);
/* Reset variables for usage in following multiplication process */
row = numRowsA;
i = 0U;
px = pDst->pData;
@@ -264,123 +671,99 @@ arm_status arm_mat_mult_q15(
/* row loop */
do
{
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the transposed pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
pInB = pSrcBT;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
/* Set variable sum, that acts as accumulator, to zero */
sum = 0;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2;
/* Initiate the pointer pIn1 to point to the starting address of the column being processed */
/* Initiate pointer pInA to point to starting address of column being processed */
pInA = pSrcA->pData + i;
/* Apply loop unrolling and compute 2 MACs simultaneously. */
colCnt = numColsA >> 2U;
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
#ifndef UNALIGNED_SUPPORT_DISABLE
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* read real and imag values from pSrcA and pSrcB buffer */
pSourceA1 = *__SIMD32(pInA)++;
pSourceB1 = *__SIMD32(pInB)++;
inA1 = read_q15x2_ia (&pInA);
inB1 = read_q15x2_ia (&pInB);
pSourceA2 = *__SIMD32(pInA)++;
pSourceB2 = *__SIMD32(pInB)++;
inA2 = read_q15x2_ia (&pInA);
inB2 = read_q15x2_ia (&pInB);
/* Multiply and Accumlates */
sum = __SMLALD(pSourceA1, pSourceB1, sum);
sum = __SMLALD(pSourceA2, pSourceB2, sum);
/* Multiply and Accumulates */
sum = __SMLALD(inA1, inB1, sum);
sum = __SMLALD(inA2, inB2, sum);
#else
/* read real and imag values from pSrcA and pSrcB buffer */
inA1 = *pInA++;
inB1 = *pInB++;
inA2 = *pInA++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
inB2 = *pInB++;
inA1 = *pInA++;
inB1 = *pInB++;
/* Multiply and Accumlates */
sum += inA2 * inB2;
inA2 = *pInA++;
inB2 = *pInB++;
/* Multiply and Accumlates */
sum += inA1 * inB1;
sum += inA2 * inB2;
#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* process remaining column samples */
colCnt = numColsA & 3U;
colCnt = numColsA % 0x4U;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
sum += *pInA++ * *pInB++;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Saturate and store the result in the destination buffer */
/* Saturate and store result in destination buffer */
*px = (q15_t) (__SSAT((sum >> 15), 16));
px++;
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
} while (col > 0U);
i = i + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
#else
#else /* #if defined (ARM_MATH_DSP) */
/* Run the below code for Cortex-M0 */
q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
q15_t *px; /* Temporary output data matrix pointer */
uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
arm_status status; /* Status of matrix multiplication */
(void)pState;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
@@ -391,11 +774,10 @@ arm_status arm_mat_mult_q15(
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
/* For every row wise process, column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
/* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
@@ -404,7 +786,7 @@ arm_status arm_mat_mult_q15(
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pSrcA */
/* Initiate pointer pIn1 to point to starting address of pSrcA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
@@ -413,45 +795,49 @@ arm_status arm_mat_mult_q15(
/* matrix multiplication */
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
/* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
/* Perform multiply-accumulates */
sum += (q31_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
/* Decrement loop counter */
colCnt--;
}
/* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
/* Saturate and store the result in the destination buffer */
/* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
/* Saturate and store result in destination buffer */
*px++ = (q15_t) __SSAT((sum >> 15), 16);
/* Decrement the column loop counter */
/* Decrement column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
/* Update pointer pIn2 to point to starting address of next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
/* Update the pointer pSrcA to point to the starting address of the next row */
/* Update pointer pSrcA to point to starting address of next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
/* Decrement row loop counter */
row--;
} while (row > 0U);
#endif /* #if defined (ARM_MATH_DSP) */
/* set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixMult group
@} end of MatrixMult group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_mult_q31.c
* Description: Q31 matrix multiplication
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,257 +26,736 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixMult
* @{
@addtogroup MatrixMult
@{
*/
/**
* @brief Q31 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using an internal 64-bit accumulator.
* The accumulator has a 2.62 format and maintains full precision of the intermediate
* multiplication results but provides only a single guard bit. There is no saturation
* on intermediate additions. Thus, if the accumulator overflows it wraps around and
* distorts the result. The input signals should be scaled down to avoid intermediate
* overflows. The input is thus scaled down by log2(numColsA) bits
* to avoid overflows, as a total of numColsA additions are performed internally.
* The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
*
* \par
* See <code>arm_mat_mult_fast_q31()</code> for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
*
@brief Q31 matrix multiplication.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function is implemented using an internal 64-bit accumulator.
The accumulator has a 2.62 format and maintains full precision of the intermediate
multiplication results but provides only a single guard bit. There is no saturation
on intermediate additions. Thus, if the accumulator overflows it wraps around and
distorts the result. The input signals should be scaled down to avoid intermediate
overflows. The input is thus scaled down by log2(numColsA) bits
to avoid overflows, as a total of numColsA additions are performed internally.
The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
@remark
Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
#define MATRIX_DIM2 2
#define MATRIX_DIM3 3
#define MATRIX_DIM4 4
__STATIC_INLINE arm_status arm_mat_mult_q31_2x2_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM2;
q63_t acc0, acc1;
q31x4_t vecB, vecA0, vecA1;
/* enable predication to disable half of vector elements */
mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM2;
pInB = pSrcB->pData;
/* load 1st B column (partial load) */
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
/* load A rows */
vecA0 = vldrwq_s32(pInA0);
vecA1 = vldrwq_s32(pInA1);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
acc0 = vrmlaldavhq(vecA0, vecB);
acc1 = vrmlaldavhq(vecA1, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_3x3_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM3;
q31_t *pInA2 = pInA1 + MATRIX_DIM3;
q63_t acc0, acc1, acc2;
q31x4_t vecB, vecA;
/* enable predication to disable last (4th) vector element */
mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM3;
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
pOut[0 * MATRIX_DIM3] = (q31_t) acc0;
pOut[1 * MATRIX_DIM3] = (q31_t) acc1;
pOut[2 * MATRIX_DIM3] = (q31_t) acc2;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
__STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
{
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32x4_t vecColBOffs;
q31_t *pInA0 = pInA;
q31_t *pInA1 = pInA0 + MATRIX_DIM4;
q31_t *pInA2 = pInA1 + MATRIX_DIM4;
q31_t *pInA3 = pInA2 + MATRIX_DIM4;
q63_t acc0, acc1, acc2, acc3;
q31x4_t vecB, vecA;
vecColBOffs = vidupq_u32((uint32_t)0, 4);
pInB = pSrcB->pData;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrwq_gather_shifted_offset_s32(pInB, vecColBOffs);
vecA = vldrwq_s32(pInA0);
acc0 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA1);
acc1 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA2);
acc2 = vrmlaldavhq(vecA, vecB);
vecA = vldrwq_s32(pInA3);
acc3 = vrmlaldavhq(vecA, vecB);
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
pOut[0 * MATRIX_DIM4] = (q31_t) acc0;
pOut[1 * MATRIX_DIM4] = (q31_t) acc1;
pOut[2 * MATRIX_DIM4] = (q31_t) acc2;
pOut[3 * MATRIX_DIM4] = (q31_t) acc3;
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
arm_status arm_mat_mult_q31(
const arm_matrix_instance_q31 * pSrcA,
const arm_matrix_instance_q31 * pSrcB,
arm_matrix_instance_q31 * pDst)
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
q63_t sum; /* Accumulator */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
uint16_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
q31_t a0, a1, a2, a3, b0, b1, b2, b3;
#ifdef ARM_MATH_MATRIX_CHECK
q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
q31_t *px; /* Temporary output data matrix pointer */
uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint16_t col, i = 0U, row = numRowsA; /* loop counters */
arm_status status; /* status of matrix multiplication */
uint32x4_t vecOffs, vecColBOffs;
uint32_t blkCnt, rowCnt; /* loop counters */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if (numRowsA == 1)
{
q63_t sum = (q63_t) *pInA * *pInB;
pOut[0] = (q31_t)(sum >> 31);
return (ARM_MATH_SUCCESS);
}
else if(numRowsA == 2)
return arm_mat_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
}
vecColBOffs = vidupq_u32((uint32_t)0, 1);
vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop
*/
rowCnt = row >> 2;
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
j = 0U;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pInA */
pIn1 = pInA;
/* Apply loop unrolling and compute 4 MACs simultaneously. */
colCnt = numColsA >> 2;
/* matrix multiplication */
while (colCnt > 0U)
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + 4 * numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
b0 = *pIn2;
pIn2 += numColsB;
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
a0 = *pIn1++;
a1 = *pIn1++;
q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
q31_t const *pInA0 = pInA;
q31_t const *pInA1 = pInA0 + numColsA;
q31_t const *pInA2 = pInA1 + numColsA;
q31_t const *pInA3 = pInA2 + numColsA;
q63_t acc0, acc1, acc2, acc3;
b1 = *pIn2;
pIn2 += numColsB;
b2 = *pIn2;
pIn2 += numColsB;
acc0 = 0LL;
acc1 = 0LL;
acc2 = 0LL;
acc3 = 0LL;
sum += (q63_t) a0 *b0;
sum += (q63_t) a1 *b1;
pSrcA0Vec = (q31_t const *) pInA0;
pSrcA1Vec = (q31_t const *) pInA1;
pSrcA2Vec = (q31_t const *) pInA2;
pSrcA3Vec = (q31_t const *) pInA3;
a2 = *pIn1++;
a3 = *pIn1++;
vecOffs = vecColBOffs;
b3 = *pIn2;
pIn2 += numColsB;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
sum += (q63_t) a2 *b2;
sum += (q63_t) a3 *b3;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
/* Decrement the loop counter */
colCnt--;
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
acc1 = vrmlaldavhaq(acc1, vecA, vecB);
vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
acc2 = vrmlaldavhaq(acc2, vecA, vecB);
vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
acc3 = vrmlaldavhaq(acc3, vecA, vecB);
}
acc0 = asrl(acc0, 23);
acc1 = asrl(acc1, 23);
acc2 = asrl(acc2, 23);
acc3 = asrl(acc3, 23);
px[0] = (q31_t) acc0;
px[1 * numColsB] = (q31_t) acc1;
px[2 * numColsB] = (q31_t) acc2;
px[3 * numColsB] = (q31_t) acc3;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
colCnt = numColsA % 0x4U;
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += (numColsA * 4);
/*
* Decrement the row loop counter
*/
rowCnt --;
while (colCnt > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/* Decrement the loop counter */
colCnt--;
}
/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) (sum >> 31);
/* Update the pointer pIn2 to point to the starting address of the next column */
j++;
pIn2 = (pSrcB->pData) + j;
/* Decrement the column loop counter */
col--;
} while (col > 0U);
#else
/* Run the below code for Cortex-M0 */
q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
/* row loop */
do
}
rowCnt = row & 3;
while (rowCnt > 0U)
{
/* Output pointer is set to starting address of the row being processed */
px = pOut + i;
/* For every row wise process, the column loop counter is to be initiated */
col = numColsB;
/* For every row wise process, the pIn2 pointer is set
** to the starting address of the pSrcB data */
pIn2 = pSrcB->pData;
/* column loop */
do
{
/* Set the variable sum, that acts as accumulator, to zero */
sum = 0;
/* Initiate the pointer pIn1 to point to the starting address of pInA */
pIn1 = pInA;
/* Matrix A columns number of MAC operations are to be performed */
colCnt = numColsA;
/* matrix multiplication */
while (colCnt > 0U)
/*
* Output pointer is set to starting address of the row being processed
*/
px = pOut + i;
i = i + numColsB;
/*
* For every row wise process, the column loop counter is to be initiated
*/
col = numColsB;
/*
* For every row wise process, the pInB pointer is set
* to the starting address of the pSrcB data
*/
pInB = (q31_t const *)pSrcB->pData;
/*
* column loop
*/
while (col > 0U)
{
/* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
/* Perform the multiply-accumulates */
sum += (q63_t) * pIn1++ * *pIn2;
pIn2 += numColsB;
/*
* generate 4 columns elements
*/
/*
* Matrix A columns number of MAC operations are to be performed
*/
/* Decrement the loop counter */
colCnt--;
q31_t const *pSrcA0Vec;
q31_t const *pInA0 = pInA;
q63_t acc0;
acc0 = 0LL;
pSrcA0Vec = (q31_t const *) pInA0;
vecOffs = vecColBOffs;
/* process 1 x 4 block output */
blkCnt = numColsA >> 2;
while (blkCnt > 0U)
{
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
/* move Matrix B read offsets, 4 rows down */
vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
q31x4_t vecB, vecA;
vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
//vecOffs = vecOffs + (uint32_t) (numColsB * 4);
vecA = vld1q(pSrcA0Vec);
pSrcA0Vec += 4;
acc0 = vrmlaldavhaq(acc0, vecA, vecB);
}
acc0 = asrl(acc0, 23);
px[0] = (q31_t) acc0;
px++;
/*
* Decrement the column loop counter
*/
col--;
/*
* Update the pointer pInB to point to the starting address of the next column
*/
pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
}
/* Convert the result from 2.62 to 1.31 format and store in destination buffer */
*px++ = (q31_t) clip_q63_to_q31(sum >> 31);
/*
* Update the pointer pInA to point to the starting address of the next row
*/
pInA += numColsA;
/*
* Decrement the row loop counter
*/
rowCnt--;
}
/* Decrement the column loop counter */
col--;
/* Update the pointer pIn2 to point to the starting address of the next column */
pIn2 = pInB + (numColsB - col);
} while (col > 0U);
#endif
/* Update the pointer pInA to point to the starting address of the next row */
i = i + numColsB;
pInA = pInA + numColsA;
/* Decrement the row loop counter */
row--;
} while (row > 0U);
/* set status as ARM_MATH_SUCCESS */
/*
* set status as ARM_MATH_SUCCESS
*/
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/**
  @brief         Q31 matrix multiplication (generic scalar path).
  @param[in]     pSrcA  points to the first input matrix structure
  @param[in]     pSrcB  points to the second input matrix structure
  @param[out]    pDst   points to the output matrix structure
  @return        ARM_MATH_SIZE_MISMATCH when the dimensions are incompatible
                 (only with ARM_MATH_MATRIX_CHECK), ARM_MATH_SUCCESS otherwise.

  Each destination element is the dot product of one row of A with one
  column of B.  Products are accumulated in a 64-bit register (2.62
  format) and narrowed back to 1.31 by an arithmetic shift of 31.
 */
arm_status arm_mat_mult_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
  arm_matrix_instance_q31 * pDst)
{
  arm_status status;                                 /* Status of matrix multiplication */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Inner dimensions must agree and pDst must be (A rows) x (B cols) */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    q31_t *pRowA    = pSrcA->pData;                  /* start of the A row in progress */
    q31_t *pDstElem = pDst->pData;                   /* next output element (row major) */
    uint16_t numColsA = pSrcA->numCols;              /* columns of A == rows of B */
    uint16_t numColsB = pSrcB->numCols;              /* columns of B (row stride of B) */
    uint32_t rowCnt   = pSrcA->numRows;              /* remaining rows of A */

    /* row loop: one pass per row of A */
    do
    {
      uint32_t colCnt = numColsB;                    /* remaining columns of B */

      /* column loop: one pass per column of B */
      do
      {
        q63_t acc = 0;                               /* 2.62 accumulator */
        q31_t *pElemA = pRowA;                       /* walks along the current A row */
        q31_t *pElemB = pSrcB->pData + (numColsB - colCnt); /* top of the current B column */
        uint32_t k;

#if defined (ARM_MATH_LOOPUNROLL)

        /* Loop unrolling: four multiply-accumulates per iteration */
        for (k = numColsA >> 2U; k > 0U; k--)
        {
          acc += (q63_t) *pElemA++ * *pElemB; pElemB += numColsB;
          acc += (q63_t) *pElemA++ * *pElemB; pElemB += numColsB;
          acc += (q63_t) *pElemA++ * *pElemB; pElemB += numColsB;
          acc += (q63_t) *pElemA++ * *pElemB; pElemB += numColsB;
        }

        /* leftover 0..3 products */
        k = numColsA % 0x4U;

#else

        /* No unrolling: process the full inner dimension one MAC at a time */
        k = numColsA;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (k > 0U)
        {
          /* c(m,n) += a(m,k) * b(k,n) */
          acc += (q63_t) *pElemA++ * *pElemB;
          pElemB += numColsB;                        /* step down the B column */
          k--;
        }

        /* Convert the result from 2.62 to 1.31 format and store it */
        *pDstElem++ = (q31_t) (acc >> 31);

        colCnt--;
      } while (colCnt > 0U);

      /* Advance to the next row of A */
      pRowA += numColsA;
      rowCnt--;
    } while (rowCnt > 0U);

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
 * @} end of MatrixMult group
 */

View File

@@ -0,0 +1,678 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_mult_q7.c
* Description: Q7 matrix multiplication
*
* $Date: 23 April 2021
*
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixMult
@{
*/
/**
* @brief Q7 matrix multiplication
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @param[in] *pState points to the array for storing intermediate results (Unused in some versions)
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
*
* \par
* The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
*
*
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/* 2x2 Q7 matrix multiply specialization for MVE.
   Called from arm_mat_mult_q7 once the dimensions are known to be 2x2,
   so no size checking is performed here.  Each B column is gather-loaded
   into a vector and dotted against both A rows in one MAC each. */
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint32_t MATRIX_DIM = 2;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA; /* first row of A */
q7_t *pInA1 = pInA0 + MATRIX_DIM; /* second row of A */
q31_t acc0, acc1;
q7x16_t vecB, vecA0, vecA1;
/* predicate restricting vector operations to the first MATRIX_DIM lanes */
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
/* byte offsets {0, 2, ...}: stepping by the row stride (2) walks one B column */
vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
pInB = pSrcB->pData;
/* gather the first column of B; the _z form zeroes inactive lanes, so the
   full-width MACs below only sum the first MATRIX_DIM products */
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
/* dot product of each A row with the B column */
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
/* narrow the accumulators back to Q7 with saturation (>> 7, SSAT to 8 bits) */
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
/* 3x3 Q7 matrix multiply specialization for MVE.
   Called from arm_mat_mult_q7 once the dimensions are known to be 3x3.
   The three A rows are kept resident in vectors while each B column is
   gather-loaded and dotted against them. */
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint8_t MATRIX_DIM = 3;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA; /* rows of A, MATRIX_DIM elements apart */
q7_t *pInA1 = pInA0 + MATRIX_DIM;
q7_t *pInA2 = pInA1 + MATRIX_DIM;
q31_t acc0, acc1, acc2;
q7x16_t vecB, vecA0, vecA1, vecA2;
/* predicate restricting vector operations to the first MATRIX_DIM lanes */
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
/* byte offsets {0, 3, 6, ...}: stepping by the row stride walks one B column */
vecColBOffs = vidupq_u8((uint32_t)0, 1);
vecColBOffs = vecColBOffs * MATRIX_DIM;
pInB = pSrcB->pData;
/* gather the first column of B; inactive lanes are zeroed by the _z gather */
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
vecA2 = vldrbq_s8(pInA2);
/* dot product of each A row with the B column */
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
/* narrow the accumulators back to Q7 with saturation */
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
/* 4x4 Q7 matrix multiply specialization for MVE.
   Called from arm_mat_mult_q7 once the dimensions are known to be 4x4.
   The four A rows stay resident in vectors; each of the four B columns
   is gather-loaded in turn and dotted against them. */
__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst)
{
const uint32_t MATRIX_DIM = 4;
q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q7_t *pOut = pDst->pData; /* output data matrix pointer */
uint8x16_t vecColBOffs;
q7_t *pInA0 = pInA; /* rows of A, MATRIX_DIM elements apart */
q7_t *pInA1 = pInA0 + MATRIX_DIM;
q7_t *pInA2 = pInA1 + MATRIX_DIM;
q7_t *pInA3 = pInA2 + MATRIX_DIM;
q31_t acc0, acc1, acc2, acc3;
q7x16_t vecB, vecA0, vecA1, vecA2, vecA3;
/* predicate restricting vector operations to the first MATRIX_DIM lanes */
mve_pred16_t p0 = vctp8q(MATRIX_DIM);
/* byte offsets {0, 4, 8, ...}: stepping by the row stride (4) walks one B column */
vecColBOffs = vidupq_u8((uint32_t)0, 4);
pInB = pSrcB->pData;
/* gather the first column of B; inactive lanes are zeroed by the _z gather */
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
vecA0 = vldrbq_s8(pInA0);
vecA1 = vldrbq_s8(pInA1);
vecA2 = vldrbq_s8(pInA2);
vecA3 = vldrbq_s8(pInA3);
/* dot product of each A row with the B column */
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
/* narrow the accumulators back to Q7 with saturation */
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
pOut++;
/* move to next B column */
pInB = pInB + 1;
vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
acc0 = vmladavq_s8(vecA0, vecB);
acc1 = vmladavq_s8(vecA1, vecB);
acc2 = vmladavq_s8(vecA2, vecB);
acc3 = vmladavq_s8(vecA3, vecB);
pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
/*
* Return to application
*/
return (ARM_MATH_SUCCESS);
}
/* MVE implementation of the Q7 matrix multiply.
   pSrcB is first transposed into the caller-supplied scratch buffer
   (pState) so that B columns become contiguous in memory; the main loop
   then produces a 2x2 block of outputs per pass using 16-lane MAC
   intrinsics, with dedicated clean-up passes afterwards for an odd
   trailing column and/or an odd trailing row.  Small square matrices
   (2x2/3x3/4x4) are dispatched to dedicated kernels that skip the
   transpose entirely. */
arm_status arm_mat_mult_q7(
const arm_matrix_instance_q7 * pSrcA,
const arm_matrix_instance_q7 * pSrcB,
arm_matrix_instance_q7 * pDst,
q7_t * pState)
{
q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
q7_t *pInA2; /* second A row of the current 2-row block */
q7_t *pInB2; /* second (transposed) B column of the current 2-column block */
q7_t *px; /* Temporary output data matrix pointer */
q7_t *px2; /* Temporary output data matrix pointer (one output row below px) */
uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
q7_t *pSrcBT = pState; /* scratch buffer holding the transpose of B */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix multiplication */
arm_matrix_instance_q7 BT; /* descriptor for the transposed B */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numCols != pSrcB->numRows) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcB->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* small squared matrix specialized routines */
if(numRowsA == numColsB && numColsB == numColsA) {
if(numRowsA == 2)
return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
else if(numRowsA == 3)
return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
else if (numRowsA == 4)
return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
}
/*
* Matrix transpose: after this, columns of B are rows of BT and can be
* streamed with contiguous vector loads
*/
BT.numRows = numColsB;
BT.numCols = numRowsB;
BT.pData = pSrcBT;
arm_mat_trans_q7(pSrcB, &BT);
/*
* Reset the variables for the usage in the following multiplication process
*/
i = 0;
row = numRowsA >> 1; /* process A rows in pairs */
px = pDst->pData;
px2 = px + numColsB;
/*
* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
*/
/*
* row loop (2 rows of A per iteration)
*/
while (row > 0u)
{
/*
* For every row wise process, the column loop counter is to be initiated
* (2 columns of B per iteration)
*/
col = numColsB >> 1;
/*
* For every row wise process, the pIn2 pointer is set
* to the starting address of the transposed pSrcB data
*/
pInB = pSrcBT;
pInB2 = pInB + numRowsB;
j = 0;
/*
* column loop: computes a 2x2 output block per iteration
*/
while (col > 0u)
{
q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
q7x16_t vecA, vecA2, vecB, vecB2;
q31_t acc0, acc1, acc2, acc3;
/*
* Initiate the pointer pIn1 to point to the starting address of the column being processed
*/
pInA = pSrcA->pData + i;
pInA2 = pInA + numColsA;
pInB = pSrcBT + j;
pInB2 = pInB + numRowsB;
pSrcAVec = (q7_t const *) pInA;
pSrcA2Vec = (q7_t const *)pInA2;
pSrcBVec = (q7_t const *) pInB;
pSrcB2Vec = (q7_t const *)pInB2;
/* four accumulators: (row0,col0) (row1,col0) (row0,col1) (row1,col1) */
acc0 = 0L;
acc1 = 0L;
acc2 = 0L;
acc3 = 0L;
/* software-pipelined: first A vector loaded ahead of the loop */
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
blkCnt = numColsA >> 4; /* 16 Q7 products per vector MAC */
while (blkCnt > 0U)
{
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
vecA2 = vld1q(pSrcA2Vec);
pSrcA2Vec += 16;
acc1 = vmladavaq_s8(acc1, vecA2, vecB);
vecB2 = vld1q(pSrcB2Vec);
pSrcB2Vec += 16;
acc2 = vmladavaq_s8(acc2, vecA, vecB2);
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
blkCnt--;
}
/*
* tail: remaining 0..15 products handled with a predicated MAC
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
vecA2 = vld1q(pSrcA2Vec);
acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
vecB2 = vld1q(pSrcB2Vec);
acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
vecA = vld1q(pSrcAVec);
acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
}
/* narrow the 2x2 block back to Q7 with saturation */
*px++ = (q7_t) __SSAT(acc0 >> 7, 8);
*px++ = (q7_t) __SSAT(acc2 >> 7, 8);
*px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
*px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
/* advance to the next pair of transposed B columns */
j += numRowsB * 2;
/*
* Decrement the column loop counter
*/
col--;
}
/* advance to the next pair of A rows / output rows */
i = i + numColsA * 2;
px = px2 + (numColsB & 1u);
px2 = px + numColsB;
/*
* Decrement the row loop counter
*/
row--;
}
/*
* Compute remaining row and/or column below
*/
if (numColsB & 1u)
{
/* last B column was not covered by the 2-column loop */
row = numRowsA & (~0x1); //avoid redundant computation
px = pDst->pData + numColsB - 1;
i = 0;
/*
* row loop
*/
while (row > 0)
{
q7_t const *pSrcAVec, *pSrcBVec;
q7x16_t vecA, vecB;
q63_t acc0;
/*
* point to last column in matrix B
*/
pInB = pSrcBT + numRowsB * (numColsB - 1);
pInA = pSrcA->pData + i;
pSrcAVec = (q7_t const *) pInA;
pSrcBVec = (q7_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) >> 4;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
}
*px = (q7_t) __SSAT(acc0 >> 7, 8);
px += numColsB;
i += numColsA;
/*
* Decrement the row loop counter
*/
row--;
}
}
if (numRowsA & 1u)
{
/* last A row was not covered by the 2-row loop; covers the full width,
   including the corner element when numColsB is also odd */
col = numColsB;
i = 0u;
/*
* point to last row in output matrix
*/
px = pDst->pData + (numColsB) * (numRowsA - 1);
/*
* col loop
*/
while (col > 0)
{
q7_t const *pSrcAVec, *pSrcBVec;
q7x16_t vecA, vecB;
q63_t acc0;
/*
* point to last row in matrix A
*/
pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
pInB = pSrcBT + i;
/*
* Initialize the accumulator for this output element
*/
pSrcAVec = (q7_t const *) pInA;
pSrcBVec = (q7_t const *) pInB;
acc0 = 0LL;
blkCnt = (numColsA) >> 4;
while (blkCnt > 0U)
{
vecA = vld1q(pSrcAVec);
pSrcAVec += 16;
vecB = vld1q(pSrcBVec);
pSrcBVec += 16;
acc0 = vmladavaq_s8(acc0, vecA, vecB);
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numColsA & 0xF;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp8q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
}
*px++ = (q7_t) __SSAT(acc0 >> 7, 8);
i += numColsA;
/*
* Decrement the col loop counter
*/
col--;
}
}
/*
* Return to application
*/
status = ARM_MATH_SUCCESS;
}
return(status);
}
#else
/**
  @brief         Q7 matrix multiplication (generic scalar path).
  @param[in]     pSrcA   points to the first input matrix structure
  @param[in]     pSrcB   points to the second input matrix structure
  @param[out]    pDst    points to the output matrix structure
  @param[in]     pState  scratch buffer, unused by this build (kept for
                         interface compatibility with the MVE variant)
  @return        ARM_MATH_SIZE_MISMATCH when the dimensions are incompatible
                 (only with ARM_MATH_MATRIX_CHECK), ARM_MATH_SUCCESS otherwise.

  Each destination element is the dot product of one row of A with one
  column of B.  Q7 products are summed in a 32-bit accumulator, then
  shifted down by 7 and saturated back to the Q7 (1.7) range.
 */
arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
{
    arm_status status;               /* status of matrix multiplication */

    (void)pState;                    /* scratch area only needed by the vector build */

#ifdef ARM_MATH_MATRIX_CHECK
    /* Inner dimensions must agree and pDst must be (A rows) x (B cols) */
    if ((pSrcA->numCols != pSrcB->numRows) ||
        (pSrcA->numRows != pDst->numRows)  ||
        (pSrcB->numCols != pDst->numCols)    )
    {
        /* Set status as ARM_MATH_SIZE_MISMATCH */
        status = ARM_MATH_SIZE_MISMATCH;
    }
    else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
    {
        q7_t *pRowA    = pSrcA->pData;      /* start of the A row in progress */
        q7_t *pDstElem = pDst->pData;       /* next output element (row major) */
        uint16_t numColsA = pSrcA->numCols; /* columns of A == rows of B */
        uint16_t numColsB = pSrcB->numCols; /* columns of B (row stride of B) */
        uint16_t rowCnt   = pSrcA->numRows; /* remaining rows of A */

        /* row loop: one pass per row of A */
        do {
            uint16_t colCnt = numColsB;     /* remaining columns of B */

            /* column loop: one pass per column of B */
            do {
                q31_t acc = 0;              /* 32-bit accumulator for Q7 products */
                q7_t *pElemA = pRowA;       /* walks along the current A row */
                q7_t *pElemB = pSrcB->pData + (numColsB - colCnt); /* top of the B column */
                uint16_t k = numColsA;      /* inner (dot-product) counter */

                while (k > 0U) {
                    /* c(m,n) += a(m,k) * b(k,n) */
                    acc += (q31_t)*pElemA++ * *pElemB;
                    pElemB += numColsB;     /* step down the B column */
                    k--;
                }

                /* Shift the accumulation down by 7 and saturate to the 1.7 range */
                *pDstElem++ = (q7_t)__SSAT((acc >> 7), 8);

                colCnt--;
            } while (colCnt > 0U);

            /* advance to the next row of A */
            pRowA += numColsA;
            rowCnt--;
        } while (rowCnt > 0U);

        /* set status as ARM_MATH_SUCCESS */
        status = ARM_MATH_SUCCESS;
    }

    /* Return to application */
    return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of MatrixMult group
*/

View File

@@ -0,0 +1,208 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_scale_f16.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixScale
@{
*/
/**
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
  @brief         Floating-point f16 matrix scaling (MVE path).
  @param[in]     pSrc   points to the input matrix
  @param[in]     scale  scale factor to be applied
  @param[out]    pDst   points to the output matrix structure
  @return        ARM_MATH_SIZE_MISMATCH when the dimensions differ
                 (only with ARM_MATH_MATRIX_CHECK), ARM_MATH_SUCCESS otherwise.

  The matrix is treated as a flat buffer of numRows*numCols samples and
  multiplied element-wise by the scalar, 8 half-precision lanes per pass,
  with a predicated store covering the final partial vector.
 */
arm_status arm_mat_scale_f16(
  const arm_matrix_instance_f16 * pSrc,
  float16_t scale,
  arm_matrix_instance_f16 * pDst)
{
  arm_status status;                               /* status of matrix scaling */

#ifdef ARM_MATH_MATRIX_CHECK

  /* source and destination must have identical dimensions */
  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    float16_t const *pSrcVec = (float16_t const *) pSrc->pData; /* input stream */
    float16_t *pDstVec = pDst->pData;                           /* output stream */
    f16x8_t vecFactor = vdupq_n_f16(scale);         /* scale broadcast to all lanes */
    uint32_t sampleCount = (uint32_t) pSrc->numRows * pSrc->numCols;
    uint32_t cnt;

    /* main loop: 8 samples per iteration */
    for (cnt = sampleCount >> 3; cnt > 0U; cnt--)
    {
      /* C(m,n) = A(m,n) * scale */
      f16x8_t vecSrc = vld1q(pSrcVec);
      pSrcVec += 8;
      vst1q(pDstVec, vmulq_f16(vecSrc, vecFactor));
      pDstVec += 8;
    }

    /* tail: 0..7 leftover samples, written under predicate */
    cnt = sampleCount & 7;
    if (cnt > 0U)
    {
      mve_pred16_t p0 = vctp16q(cnt);
      f16x8_t vecSrc = vld1q(pSrcVec);
      vstrhq_p(pDstVec, vecSrc * scale, p0);
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
/* Portable (non-vector) implementation: scales every element of the source
 * matrix by `scale` in half precision and writes it to the destination. */
arm_status arm_mat_scale_f16(
const arm_matrix_instance_f16 * pSrc,
float16_t scale,
arm_matrix_instance_f16 * pDst)
{
  arm_status status;                           /* result of the size check / operation */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Source and destination matrices must have identical dimensions. */
  if ((pSrc->numRows != pDst->numRows) ||
      (pSrc->numCols != pDst->numCols)   )
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    float16_t *pIn  = pSrc->pData;             /* walks over the source elements */
    float16_t *pOut = pDst->pData;             /* walks over the destination elements */

    /* The matrix is processed as one flat buffer of rows*cols elements. */
    uint32_t remaining = (uint32_t) pSrc->numRows * pSrc->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Unrolled section: four elements per iteration. */
    uint32_t grpCnt = remaining >> 2U;
    while (grpCnt > 0U)
    {
      /* C(m,n) = A(m,n) * scale, computed in half precision */
      *pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
      *pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
      *pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
      *pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
      grpCnt--;
    }
    /* 0..3 elements remain after the unrolled section. */
    remaining &= 3U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    /* Scalar cleanup (or full loop when unrolling is disabled). */
    while (remaining > 0U)
    {
      *pOut++ = (_Float16)(*pIn++) * (_Float16)scale;
      remaining--;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixScale group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_scale_f32.c
* Description: Multiplies a floating-point matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,45 +26,114 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixScale Matrix Scale
*
* Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
* matrix by the scalar. For example:
* \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
*
* The function checks to make sure that the input and output matrices are of the same size.
*
* In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
* a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
* The shift allows the gain of the scaling operation to exceed 1.0.
* The overall scale factor applied to the fixed-point data is
* <pre>
* scale = scaleFract * 2^shift.
* </pre>
@defgroup MatrixScale Matrix Scale
Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
matrix by the scalar. For example:
\image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
The function checks to make sure that the input and output matrices are of the same size.
In the fixed-point Q15 and Q31 functions, <code>scale</code> is represented by
a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.
The shift allows the gain of the scaling operation to exceed 1.0.
The overall scale factor applied to the fixed-point data is
<pre>
scale = scaleFract * 2^shift.
</pre>
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Floating-point matrix scaling.
* @param[in] *pSrc points to input matrix structure
* @param[in] scale scale factor to be applied
* @param[out] *pDst points to output matrix structure
* @return The function returns either <code>ARM_MATH_SIZE_MISMATCH</code>
* or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
@brief Floating-point matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scale scale factor to be applied
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Helium (MVE) implementation: scales the matrix as one flat buffer,
 * 4 single-precision lanes per vector operation, with a predicated tail. */
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
arm_status status;                             /* status of matrix scaling */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: src and dst must have identical dimensions */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
uint32_t numSamples;                           /* total number of elements in the matrix */
uint32_t blkCnt;                               /* loop counter */
f32x4_t vecIn, vecOut;                         /* 4-lane f32 vector registers */
float32_t const *pInVec;
pInVec = (float32_t const *) pIn;
/*
 * Total number of samples in the input matrix
 */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
/* Main loop: full groups of 4 elements */
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
 * C(m,n) = A(m,n) * scale
 * Scaling and results are stored in the destination buffer.
 */
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vecIn * scale;
vst1q(pOut, vecOut);
pOut += 4;
/*
 * Decrement the blockSize loop counter
 */
blkCnt--;
}
/*
 * tail: 0..3 leftover elements, written back under lane predication
 */
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);             /* predicate enabling only the first blkCnt lanes */
vecIn = vld1q(pInVec);
vecOut = vecIn * scale;
vstrwq_p(pOut, vecOut, p0);                    /* predicated store: untouched lanes are not written */
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON_EXPERIMENTAL)
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
@@ -76,12 +145,6 @@ arm_status arm_mat_scale_f32(
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
#if defined (ARM_MATH_DSP)
float32_t in1, in2, in3, in4; /* temporary variables */
float32_t out1, out2, out3, out4; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
@@ -93,37 +156,23 @@ arm_status arm_mat_scale_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
/* Scaling and results are stored in the destination buffer. */
in1 = pIn[0];
in2 = pIn[1];
in3 = pIn[2];
in4 = pIn[3];
out1 = in1 * scale;
out2 = in2 * scale;
out3 = in3 * scale;
out4 = in4 * scale;
pOut[0] = out1;
pOut[1] = out2;
pOut[2] = out3;
pOut[3] = out4;
vec1 = vld1q_f32(pIn);
res = vmulq_f32(vec1, vdupq_n_f32(scale));
vst1q_f32(pOut, res);
/* update pointers to process next sampels */
pIn += 4U;
@@ -137,15 +186,6 @@ arm_status arm_mat_scale_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * scale */
@@ -163,7 +203,85 @@ arm_status arm_mat_scale_f32(
/* Return to application */
return (status);
}
#else
/* Portable (non-vector) implementation: scales every element of the source
 * matrix by `scale` and writes it to the destination. */
arm_status arm_mat_scale_f32(
const arm_matrix_instance_f32 * pSrc,
float32_t scale,
arm_matrix_instance_f32 * pDst)
{
  arm_status status;                           /* result of the size check / operation */

#ifdef ARM_MATH_MATRIX_CHECK
  /* Source and destination matrices must have identical dimensions. */
  if ((pSrc->numRows != pDst->numRows) ||
      (pSrc->numCols != pDst->numCols)   )
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    float32_t *pIn  = pSrc->pData;             /* walks over the source elements */
    float32_t *pOut = pDst->pData;             /* walks over the destination elements */

    /* The matrix is processed as one flat buffer of rows*cols elements. */
    uint32_t remaining = (uint32_t) pSrc->numRows * pSrc->numCols;

#if defined (ARM_MATH_LOOPUNROLL)
    /* Unrolled section: four elements per iteration. */
    uint32_t grpCnt = remaining >> 2U;
    while (grpCnt > 0U)
    {
      /* C(m,n) = A(m,n) * scale */
      *pOut++ = (*pIn++) * scale;
      *pOut++ = (*pIn++) * scale;
      *pOut++ = (*pIn++) * scale;
      *pOut++ = (*pIn++) * scale;
      grpCnt--;
    }
    /* 0..3 elements remain after the unrolled section. */
    remaining &= 3U;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    /* Scalar cleanup (or full loop when unrolling is disabled). */
    while (remaining > 0U)
    {
      *pOut++ = (*pIn++) * scale;
      remaining--;
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_scale_q15.c
* Description: Multiplies a Q15 matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,139 +26,102 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Q15 matrix scaling.
* @param[in] *pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
* These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
@brief Q15 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
int32_t totShift = 15 - shift; /* total shift to apply after scaling */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
arm_status status; /* Status of matrix scaling */
q15_t *pIn = pSrc->pData; /* input data matrix pointer */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q15x8_t vecIn, vecOut;
q15_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
pInVec = (q15_t const *) pIn;
#if defined (ARM_MATH_DSP)
#ifdef ARM_MATH_MATRIX_CHECK
q15_t in1, in2, in3, in4;
q31_t out1, out2, out3, out4;
q31_t inA1, inA2;
#endif // #if defined (ARM_MATH_DSP)
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif // #ifdef ARM_MATH_MATRIX_CHECK
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
/* Reading 2 inputs from memory */
inA1 = _SIMD32_OFFSET(pIn);
inA2 = _SIMD32_OFFSET(pIn + 2);
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec); pInVec += 8;
/* C = A * scale */
/* Scale the inputs and then store the 2 results in the destination buffer
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) inA1 * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) inA2 * scaleFract);
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
out1 = out1 >> totShift;
inA1 = _SIMD32_OFFSET(pIn + 4);
out2 = out2 >> totShift;
inA2 = _SIMD32_OFFSET(pIn + 6);
out3 = out3 >> totShift;
out4 = out4 >> totShift;
vst1q(pOut, vecOut); pOut += 8;
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
_SIMD32_OFFSET(pOut) = __PKHBT(in2, in1, 16);
_SIMD32_OFFSET(pOut + 2) = __PKHBT(in4, in3, 16);
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
blkCnt--;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
*pOut++ =
(q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> totShift, 16));
/* Decrement the numSamples loop counter */
blkCnt--;
mve_pred16_t p0 = vctp16q(blkCnt);
vecIn = vld1q(pInVec); pInVec += 8;
vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrhq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -166,6 +129,121 @@ arm_status arm_mat_scale_q15(
return (status);
}
#else
arm_status arm_mat_scale_q15(
const arm_matrix_instance_q15 * pSrc,
q15_t scaleFract,
int32_t shift,
arm_matrix_instance_q15 * pDst)
{
q15_t *pIn = pSrc->pData; /* Input data matrix pointer */
q15_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = 15 - shift; /* Total shift to apply after scaling */
#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
q31_t inA1, inA2;
q31_t out1, out2, out3, out4; /* Temporary output variables */
q15_t in1, in2, in3, in4; /* Temporary input variables */
#endif
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
#if defined (ARM_MATH_DSP)
/* read 2 times 2 samples at a time from source */
inA1 = read_q15x2_ia (&pIn);
inA2 = read_q15x2_ia (&pIn);
/* Scale inputs and store result in temporary variables
* in single cycle by packing the outputs */
out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
out2 = (q31_t) ((q15_t) (inA1 ) * scaleFract);
out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
out4 = (q31_t) ((q15_t) (inA2 ) * scaleFract);
/* apply shifting */
out1 = out1 >> kShift;
out2 = out2 >> kShift;
out3 = out3 >> kShift;
out4 = out4 >> kShift;
/* saturate the output */
in1 = (q15_t) (__SSAT(out1, 16));
in2 = (q15_t) (__SSAT(out2, 16));
in3 = (q15_t) (__SSAT(out3, 16));
in4 = (q15_t) (__SSAT(out4, 16));
/* store result to destination */
write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
#else
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and store result in destination buffer. */
*pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
/* Decrement loop counter */
blkCnt--;
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_scale_q31.c
* Description: Multiplies a Q31 matrix by a scalar
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,155 +26,205 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixScale
* @{
@addtogroup MatrixScale
@{
*/
/**
* @brief Q31 matrix scaling.
* @param[in] *pSrc points to input matrix
* @param[in] scaleFract fractional portion of the scale factor
* @param[in] shift number of bits to shift the result by
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* @details
* <b>Scaling and Overflow Behavior:</b>
* \par
* The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
* These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
*/
@brief Q31 matrix scaling.
@param[in] pSrc points to input matrix
@param[in] scaleFract fractional portion of the scale factor
@param[in] shift number of bits to shift the result by
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
int32_t totShift = shift + 1; /* shift to apply after scaling */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix scaling */
q31_t in1, in2, out1; /* temporary variabels */
q31_t *pIn = pSrc->pData; /* input data matrix pointer */
q31_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
q31x4_t vecIn, vecOut;
q31_t const *pInVec;
int32_t totShift = shift + 1; /* shift to apply after scaling */
arm_status status; /* Status of matrix scaling */
#if defined (ARM_MATH_DSP)
pInVec = (q31_t const *) pIn;
#ifdef ARM_MATH_MATRIX_CHECK
q31_t in3, in4, out2, out3, out4; /* temporary variables */
#endif // #ifndef ARM_MAT_CM0
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch */
if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif // #ifdef ARM_MATH_MATRIX_CHECK
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/*
* C(m,n) = A(m,n) * scale
* Scaling and results are stored in the destination buffer.
*/
vecIn = vld1q(pInVec);
pInVec += 4;
/* multiply input with scaler value */
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
/* apply shifting */
vecOut = vqshlq_r(vecOut, totShift);
vst1q(pOut, vecOut);
pOut += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecIn = vld1q(pInVec);
pInVec += 4;
vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
vecOut = vqshlq_r(vecOut, totShift);
vstrwq_p(pOut, vecOut, p0);
}
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_scale_q31(
const arm_matrix_instance_q31 * pSrc,
q31_t scaleFract,
int32_t shift,
arm_matrix_instance_q31 * pDst)
{
q31_t *pIn = pSrc->pData; /* Input data matrix pointer */
q31_t *pOut = pDst->pData; /* Output data matrix pointer */
uint32_t numSamples; /* Total number of elements in the matrix */
uint32_t blkCnt; /* Loop counter */
arm_status status; /* Status of matrix scaling */
int32_t kShift = shift + 1; /* Shift to apply after scaling */
q31_t in, out; /* Temporary variabels */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrc->numRows != pDst->numRows) ||
(pSrc->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
#if defined (ARM_MATH_DSP)
#if defined (ARM_MATH_LOOPUNROLL)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Read values from input */
in1 = *pIn;
in2 = *(pIn + 1);
in3 = *(pIn + 2);
in4 = *(pIn + 3);
/* multiply input with scaler value */
in1 = ((q63_t) in1 * scaleFract) >> 32;
in2 = ((q63_t) in2 * scaleFract) >> 32;
in3 = ((q63_t) in3 * scaleFract) >> 32;
in4 = ((q63_t) in4 * scaleFract) >> 32;
/* Scale, saturate and store result in destination buffer. */
in = *pIn++; /* read four inputs from source */
in = ((q63_t) in * scaleFract) >> 32; /* multiply input with scaler value */
out = in << kShift; /* apply shifting */
if (in != (out >> kShift)) /* saturate the results. */
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out; /* Store result destination */
/* apply shifting */
out1 = in1 << totShift;
out2 = in2 << totShift;
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
/* saturate the results. */
if (in1 != (out1 >> totShift))
out1 = 0x7FFFFFFF ^ (in1 >> 31);
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
if (in2 != (out2 >> totShift))
out2 = 0x7FFFFFFF ^ (in2 >> 31);
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
out3 = in3 << totShift;
out4 = in4 << totShift;
*pOut = out1;
*(pOut + 1) = out2;
if (in3 != (out3 >> totShift))
out3 = 0x7FFFFFFF ^ (in3 >> 31);
if (in4 != (out4 >> totShift))
out4 = 0x7FFFFFFF ^ (in4 >> 31);
*(pOut + 2) = out3;
*(pOut + 3) = out4;
/* update pointers to process next sampels */
pIn += 4U;
pOut += 4U;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
/* If the numSamples is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) * k */
/* Scale, saturate and then store the results in the destination buffer. */
in1 = *pIn++;
in2 = ((q63_t) in1 * scaleFract) >> 32;
/* Scale, saturate and store result in destination buffer. */
in = *pIn++;
in = ((q63_t) in * scaleFract) >> 32;
out = in << kShift;
if (in != (out >> kShift))
out = 0x7FFFFFFF ^ (in >> 31);
*pOut++ = out;
out1 = in2 << totShift;
if (in2 != (out1 >> totShift))
out1 = 0x7FFFFFFF ^ (in2 >> 31);
*pOut++ = out1;
/* Decrement the numSamples loop counter */
/* Decrement loop counter */
blkCnt--;
}
@@ -185,7 +235,8 @@ arm_status arm_mat_scale_q31(
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixScale group
@} end of MatrixScale group
*/

View File

@@ -0,0 +1,234 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f16.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
/* Helium (MVE) implementation: forward substitution solving LT . X = A.
 * Right-hand-side columns are processed 8 at a time with f16 vectors;
 * leftover columns (cols % 8) fall back to a scalar inner loop. */
arm_status arm_mat_solve_lower_triangular_f16(
const arm_matrix_instance_f16 * lt,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
arm_status status;                             /* status of matrix inverse */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition: LT must be square and match A's row count */
if ((lt->numRows != lt->numCols) ||
(lt->numRows != a->numRows) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* a1 b1 c1 x1 = a1
b2 c2 x2 a2
c3 x3 a3
x3 = a3 / c3
x2 = (a2 - c2 x3) / b2
*/
int i,j,k,n,cols;
n = dst->numRows;                              /* order of the triangular system */
cols = dst->numCols;                           /* number of right-hand-side columns */
float16_t *pX = dst->pData;                    /* solution matrix X */
float16_t *pLT = lt->pData;                    /* lower-triangular coefficients */
float16_t *pA = a->pData;                      /* right-hand side A */
float16_t *lt_row;
float16_t *a_col;
_Float16 invLT;                                /* reciprocal of the current pivot */
f16x8_t vecA;
f16x8_t vecX;
for(i=0; i < n ; i++)
{
/* Vectorized path: 8 right-hand-side columns per iteration */
for(j=0; j+7 < cols; j += 8)
{
vecA = vld1q_f16(&pA[i * cols + j]);
/* Subtract contributions of already-solved rows: vecA -= LT(i,k) * X(k, j..j+7) */
for(k=0; k < i; k++)
{
vecX = vld1q_f16(&pX[cols*k+j]);
vecA = vfmsq(vecA,vdupq_n_f16(pLT[n*i + k]),vecX);
}
/* A zero pivot means the system is singular */
if ((_Float16)pLT[n*i + i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
invLT = 1.0f16 / (_Float16)pLT[n*i + i];
vecA = vmulq(vecA,vdupq_n_f16(invLT));
vst1q(&pX[i*cols+j],vecA);
}
/* Scalar tail: remaining cols % 8 columns */
for(; j < cols; j ++)
{
a_col = &pA[j];
lt_row = &pLT[n*i];
_Float16 tmp=a_col[i * cols];
for(k=0; k < i; k++)
{
tmp -= (_Float16)lt_row[k] * (_Float16)pX[cols*k+j];
}
if ((_Float16)lt_row[i]==0.0f16)
{
return(ARM_MATH_SINGULAR);
}
tmp = tmp / (_Float16)lt_row[i];
pX[i*cols+j] = tmp;
}
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
/* Portable (non-vector) implementation: solves LT . X = A by forward
 * substitution, one right-hand-side column at a time.
 * Returns ARM_MATH_SINGULAR when a zero pivot is encountered. */
arm_status arm_mat_solve_lower_triangular_f16(
const arm_matrix_instance_f16 * lt,
const arm_matrix_instance_f16 * a,
arm_matrix_instance_f16 * dst)
{
  arm_status status;                           /* outcome of the solve */

#ifdef ARM_MATH_MATRIX_CHECK
  /* LT must be square and its row count must match A's. */
  if ((lt->numRows != lt->numCols) ||
      (lt->numRows != a->numRows)   )
  {
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* a1 b1 c1   x1   a1
          b2 c2   x2 = a2       x3 = a3 / c3
             c3   x3   a3       x2 = (a2 - c2 x3) / b2  (forward substitution) */
    float16_t *pX  = dst->pData;               /* solution matrix X */
    float16_t *pLT = lt->pData;                /* lower-triangular coefficients */
    float16_t *pA  = a->pData;                 /* right-hand side A */
    int n    = dst->numRows;                   /* order of the triangular system */
    int cols = dst->numCols;                   /* number of right-hand-side columns */

    for (int col = 0; col < cols; col++)
    {
      /* Column `col` of A, accessed with stride `cols`. */
      float16_t *rhs = &pA[col];

      for (int row = 0; row < n; row++)
      {
        float16_t *ltRow = &pLT[n * row];      /* row `row` of LT */
        float16_t  acc   = rhs[row * cols];    /* start from A(row, col) */

        /* Subtract contributions of the already-solved unknowns. */
        for (int k = 0; k < row; k++)
        {
          acc -= (_Float16)ltRow[k] * (_Float16)pX[cols * k + col];
        }

        /* A zero pivot means the system cannot be solved. */
        if ((_Float16)ltRow[row] == 0.0f16)
        {
          return (ARM_MATH_SINGULAR);
        }

        acc = (_Float16)acc / (_Float16)ltRow[row];
        pX[row * cols + col] = acc;
      }
    }

    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,333 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f32.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_lower_triangular_f32(
  const arm_matrix_instance_f32 * lt,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: LT must be square and its
     row count must equal A's row count. */
  if ((lt->numRows != lt->numCols) ||
      (lt->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Forward substitution, sketched on a 3x3 triangular system:
       a1 b1 c1   x1 = a1
          b2 c2   x2   a2
             c3   x3   a3

       x3 = a3 / c3
       x2 = (a2 - c2 x3) / b2
    */
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float32_t *pX = dst->pData;
    float32_t *pLT = lt->pData;
    float32_t *pA = a->pData;

    float32_t *lt_row;     /* current row of LT (scalar tail only) */
    float32_t *a_col;      /* current column of A (scalar tail only) */
    float32_t invLT;       /* reciprocal of the diagonal pivot */

    f32x4_t vecA;
    f32x4_t vecX;

    /* MVE path: rows are solved in order (forward substitution);
       within each row, 4 right-hand-side columns are processed per
       vector iteration. */
    for(i=0; i < n ; i++)
    {
      for(j=0; j+3 < cols; j += 4)
      {
        vecA = vld1q_f32(&pA[i * cols + j]);
        /* vecA -= LT[i][k] * X[k][j..j+3] for the solved rows k < i. */
        for(k=0; k < i; k++)
        {
          vecX = vld1q_f32(&pX[cols*k+j]);
          vecA = vfmsq(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if (pLT[n*i + i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        /* Multiply by the reciprocal once instead of dividing per lane. */
        invLT = 1.0f / pLT[n*i + i];
        vecA = vmulq(vecA,vdupq_n_f32(invLT));
        vst1q(&pX[i*cols+j],vecA);
      }

      /* Scalar tail for the remaining (cols % 4) columns. */
      for(; j < cols; j ++)
      {
        a_col = &pA[j];
        lt_row = &pLT[n*i];

        float32_t tmp=a_col[i * cols];
        for(k=0; k < i; k++)
        {
          tmp -= lt_row[k] * pX[cols*k+j];
        }

        if (lt_row[i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = tmp / lt_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_lower_triangular_f32(
  const arm_matrix_instance_f32 * lt,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: LT must be square and its
     row count must equal A's row count. */
  if ((lt->numRows != lt->numCols) ||
      (lt->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Forward substitution, sketched on a 3x3 triangular system:
       a1 b1 c1   x1 = a1
          b2 c2   x2   a2
             c3   x3   a3

       x3 = a3 / c3
       x2 = (a2 - c2 x3) / b2
    */
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float32_t *pX = dst->pData;
    float32_t *pLT = lt->pData;
    float32_t *pA = a->pData;

    float32_t *lt_row;     /* current row of LT (scalar tail only) */
    float32_t *a_col;      /* current column of A (scalar tail only) */
    float32_t invLT;       /* reciprocal of the diagonal pivot */

    f32x4_t vecA;
    f32x4_t vecX;

    /* NEON path: rows are solved in order (forward substitution);
       within each row, 4 right-hand-side columns are processed per
       vector iteration. */
    for(i=0; i < n ; i++)
    {
      for(j=0; j+3 < cols; j += 4)
      {
        vecA = vld1q_f32(&pA[i * cols + j]);
        /* vecA -= LT[i][k] * X[k][j..j+3] for the solved rows k < i. */
        for(k=0; k < i; k++)
        {
          vecX = vld1q_f32(&pX[cols*k+j]);
          vecA = vfmsq_f32(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if (pLT[n*i + i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        /* Multiply by the reciprocal once instead of dividing per lane. */
        invLT = 1.0f / pLT[n*i + i];
        vecA = vmulq_f32(vecA,vdupq_n_f32(invLT));
        vst1q_f32(&pX[i*cols+j],vecA);
      }

      /* Scalar tail for the remaining (cols % 4) columns. */
      for(; j < cols; j ++)
      {
        a_col = &pA[j];
        lt_row = &pLT[n*i];

        float32_t tmp=a_col[i * cols];
        for(k=0; k < i; k++)
        {
          tmp -= lt_row[k] * pX[cols*k+j];
        }

        if (lt_row[i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = tmp / lt_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
arm_status arm_mat_solve_lower_triangular_f32(
  const arm_matrix_instance_f32 * lt,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* outcome of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* LT must be square and its row count must match A's. */
  if ((lt->numRows != lt->numCols) ||
      (lt->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Forward substitution, one column of X at a time:
       x[0] = a[0] / l[0][0]
       x[i] = (a[i] - sum_{k<i} l[i][k] * x[k]) / l[i][i]
    */
    const int order  = dst->numRows;             /* dimension of LT     */
    const int nbCols = dst->numCols;             /* columns of A and X  */

    float32_t *pX  = dst->pData;
    float32_t *pLT = lt->pData;
    float32_t *pA  = a->pData;

    int col;
    for (col = 0; col < nbCols; col++)
    {
      float32_t *aCol = &pA[col];                /* column `col` of A   */
      int row;

      for (row = 0; row < order; row++)
      {
        float32_t acc = aCol[row * nbCols];
        float32_t *ltRow = &pLT[order * row];    /* row `row` of LT     */
        int k;

        /* Remove contributions of the already-solved unknowns. */
        for (k = 0; k < row; k++)
        {
          acc -= ltRow[k] * pX[nbCols * k + col];
        }

        /* A zero diagonal entry makes the system unsolvable. */
        if (ltRow[row] == 0.0f)
        {
          return (ARM_MATH_SINGULAR);
        }

        acc = acc / ltRow[row];
        pX[row * nbCols + col] = acc;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_lower_triangular_f64.c
* Description: Solve linear system LT X = A with LT lower triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve LT . X = A where LT is a lower triangular matrix
* @param[in] lt The lower triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of LT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
arm_status arm_mat_solve_lower_triangular_f64(
  const arm_matrix_instance_f64 * lt,
  const arm_matrix_instance_f64 * a,
  arm_matrix_instance_f64 * dst)
{
  arm_status status;                             /* outcome of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* LT must be square and its row count must match A's. */
  if ((lt->numRows != lt->numCols) ||
      (lt->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Forward substitution, one column of X at a time:
       x[0] = a[0] / l[0][0]
       x[i] = (a[i] - sum_{k<i} l[i][k] * x[k]) / l[i][i]
    */
    const int order  = dst->numRows;             /* dimension of LT     */
    const int nbCols = dst->numCols;             /* columns of A and X  */

    float64_t *pX  = dst->pData;
    float64_t *pLT = lt->pData;
    float64_t *pA  = a->pData;

    int col;
    for (col = 0; col < nbCols; col++)
    {
      float64_t *aCol = &pA[col];                /* column `col` of A   */
      int row;

      for (row = 0; row < order; row++)
      {
        float64_t acc = aCol[row * nbCols];
        float64_t *ltRow = &pLT[order * row];    /* row `row` of LT     */
        int k;

        /* Remove contributions of the already-solved unknowns. */
        for (k = 0; k < row; k++)
        {
          acc -= ltRow[k] * pX[nbCols * k + col];
        }

        /* A zero diagonal entry makes the system unsolvable. */
        if (ltRow[row] == 0.0)
        {
          return (ARM_MATH_SINGULAR);
        }

        acc = acc / ltRow[row];
        pX[row * nbCols + col] = acc;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,226 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f16.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_upper_triangular_f16(
  const arm_matrix_instance_f16 * ut,
  const arm_matrix_instance_f16 * a,
  arm_matrix_instance_f16 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: UT must be square and its
     row count must equal A's row count. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float16_t *pX = dst->pData;
    float16_t *pUT = ut->pData;
    float16_t *pA = a->pData;

    float16_t *ut_row;     /* current row of UT (scalar tail only) */
    float16_t *a_col;      /* current column of A (scalar tail only) */
    _Float16 invUT;        /* reciprocal of the diagonal pivot */

    f16x8_t vecA;
    f16x8_t vecX;

    /* MVE path: back substitution, rows solved from the bottom up;
       within each row, 8 right-hand-side columns per vector iteration. */
    for(i=n-1; i >= 0 ; i--)
    {
      for(j=0; j+7 < cols; j +=8)
      {
        vecA = vld1q_f16(&pA[i * cols + j]);
        /* vecA -= UT[i][k] * X[k][j..j+7] for the solved rows k > i. */
        for(k=n-1; k > i; k--)
        {
          vecX = vld1q_f16(&pX[cols*k+j]);
          vecA = vfmsq(vecA,vdupq_n_f16(pUT[n*i + k]),vecX);
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if ((_Float16)pUT[n*i + i]==0.0f16)
        {
          return(ARM_MATH_SINGULAR);
        }
        /* Multiply by the reciprocal once instead of dividing per lane. */
        invUT = 1.0f16 / (_Float16)pUT[n*i + i];
        vecA = vmulq(vecA,vdupq_n_f16(invUT));
        vst1q(&pX[i*cols+j],vecA);
      }

      /* Scalar tail for the remaining (cols % 8) columns. */
      for(; j < cols; j ++)
      {
        a_col = &pA[j];
        ut_row = &pUT[n*i];

        _Float16 tmp=a_col[i * cols];
        for(k=n-1; k > i; k--)
        {
          tmp -= (_Float16)ut_row[k] * (_Float16)pX[cols*k+j];
        }

        if ((_Float16)ut_row[i]==0.0f16)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = tmp / (_Float16)ut_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
arm_status arm_mat_solve_upper_triangular_f16(
  const arm_matrix_instance_f16 * ut,
  const arm_matrix_instance_f16 * a,
  arm_matrix_instance_f16 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: UT must be square and its
     row count must equal A's row count. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float16_t *pX = dst->pData;
    float16_t *pUT = ut->pData;
    float16_t *pA = a->pData;

    float16_t *ut_row;     /* current row of UT */
    float16_t *a_col;      /* current column of A */

    /* Back substitution, one column of X at a time; rows are solved
       from the bottom (i = n-1) up. */
    for(j=0; j < cols; j ++)
    {
      a_col = &pA[j];

      for(i=n-1; i >= 0 ; i--)
      {
        ut_row = &pUT[n*i];

        float16_t tmp=a_col[i * cols];
        /* Subtract contributions of the already-solved unknowns
           (rows i+1 .. n-1). Arithmetic is done in _Float16 to match
           the storage precision. */
        for(k=n-1; k > i; k--)
        {
          tmp -= (_Float16)ut_row[k] * (_Float16)pX[cols*k+j];
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if ((_Float16)ut_row[i]==0.0f16)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = (_Float16)tmp / (_Float16)ut_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -0,0 +1,319 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f32.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
#include "arm_helium_utils.h"
arm_status arm_mat_solve_upper_triangular_f32(
  const arm_matrix_instance_f32 * ut,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: UT must be square and its
     row count must equal A's row count. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float32_t *pX = dst->pData;
    float32_t *pUT = ut->pData;
    float32_t *pA = a->pData;

    float32_t *ut_row;     /* current row of UT (scalar tail only) */
    float32_t *a_col;      /* current column of A (scalar tail only) */
    float32_t invUT;       /* reciprocal of the diagonal pivot */

    f32x4_t vecA;
    f32x4_t vecX;

    /* MVE path: back substitution, rows solved from the bottom up;
       within each row, 4 right-hand-side columns per vector iteration. */
    for(i=n-1; i >= 0 ; i--)
    {
      for(j=0; j+3 < cols; j +=4)
      {
        vecA = vld1q_f32(&pA[i * cols + j]);
        /* vecA -= UT[i][k] * X[k][j..j+3] for the solved rows k > i. */
        for(k=n-1; k > i; k--)
        {
          vecX = vld1q_f32(&pX[cols*k+j]);
          vecA = vfmsq(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if (pUT[n*i + i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        /* Multiply by the reciprocal once instead of dividing per lane. */
        invUT = 1.0f / pUT[n*i + i];
        vecA = vmulq(vecA,vdupq_n_f32(invUT));
        vst1q(&pX[i*cols+j],vecA);
      }

      /* Scalar tail for the remaining (cols % 4) columns. */
      for(; j < cols; j ++)
      {
        a_col = &pA[j];
        ut_row = &pUT[n*i];

        float32_t tmp=a_col[i * cols];
        for(k=n-1; k > i; k--)
        {
          tmp -= ut_row[k] * pX[cols*k+j];
        }

        if (ut_row[i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = tmp / ut_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_solve_upper_triangular_f32(
  const arm_matrix_instance_f32 * ut,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* status of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: UT must be square and its
     row count must equal A's row count. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    int i,j,k,n,cols;

    n = dst->numRows;      /* dimension of the (square) triangular matrix */
    cols = dst->numCols;   /* number of right-hand-side columns in A / X  */

    float32_t *pX = dst->pData;
    float32_t *pUT = ut->pData;
    float32_t *pA = a->pData;

    float32_t *ut_row;     /* current row of UT (scalar tail only) */
    float32_t *a_col;      /* current column of A (scalar tail only) */
    float32_t invUT;       /* reciprocal of the diagonal pivot */

    f32x4_t vecA;
    f32x4_t vecX;

    /* NEON path: back substitution, rows solved from the bottom up;
       within each row, 4 right-hand-side columns per vector iteration. */
    for(i=n-1; i >= 0 ; i--)
    {
      for(j=0; j+3 < cols; j +=4)
      {
        vecA = vld1q_f32(&pA[i * cols + j]);
        /* vecA -= UT[i][k] * X[k][j..j+3] for the solved rows k > i. */
        for(k=n-1; k > i; k--)
        {
          vecX = vld1q_f32(&pX[cols*k+j]);
          vecA = vfmsq_f32(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
        }

        /* A zero pivot on the diagonal means the system is singular. */
        if (pUT[n*i + i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        /* Multiply by the reciprocal once instead of dividing per lane. */
        invUT = 1.0f / pUT[n*i + i];
        vecA = vmulq_f32(vecA,vdupq_n_f32(invUT));
        vst1q_f32(&pX[i*cols+j],vecA);
      }

      /* Scalar tail for the remaining (cols % 4) columns. */
      for(; j < cols; j ++)
      {
        a_col = &pA[j];
        ut_row = &pUT[n*i];

        float32_t tmp=a_col[i * cols];
        for(k=n-1; k > i; k--)
        {
          tmp -= ut_row[k] * pX[cols*k+j];
        }

        if (ut_row[i]==0.0f)
        {
          return(ARM_MATH_SINGULAR);
        }
        tmp = tmp / ut_row[i];
        pX[i*cols+j] = tmp;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
arm_status arm_mat_solve_upper_triangular_f32(
  const arm_matrix_instance_f32 * ut,
  const arm_matrix_instance_f32 * a,
  arm_matrix_instance_f32 * dst)
{
  arm_status status;                             /* outcome of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* UT must be square and its row count must match A's. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Back substitution, one column of X at a time:
       x[n-1] = a[n-1] / u[n-1][n-1]
       x[i]   = (a[i] - sum_{k>i} u[i][k] * x[k]) / u[i][i]
    */
    const int order  = dst->numRows;             /* dimension of UT     */
    const int nbCols = dst->numCols;             /* columns of A and X  */

    float32_t *pX  = dst->pData;
    float32_t *pUT = ut->pData;
    float32_t *pA  = a->pData;

    int col;
    for (col = 0; col < nbCols; col++)
    {
      float32_t *aCol = &pA[col];                /* column `col` of A   */
      int row;

      /* Rows are solved from the bottom up. */
      for (row = order - 1; row >= 0; row--)
      {
        float32_t acc = aCol[row * nbCols];
        float32_t *utRow = &pUT[order * row];    /* row `row` of UT     */
        int k;

        /* Remove contributions of the already-solved unknowns,
           scanned from the last row down to row+1 (same order as the
           reference implementation, so rounding is identical). */
        for (k = order - 1; k > row; k--)
        {
          acc -= utRow[k] * pX[nbCols * k + col];
        }

        /* A zero diagonal entry makes the system unsolvable. */
        if (utRow[row] == 0.0f)
        {
          return (ARM_MATH_SINGULAR);
        }

        acc = acc / utRow[row];
        pX[row * nbCols + col] = acc;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,120 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_solve_upper_triangular_f64.c
* Description: Solve linear system UT X = A with UT upper triangular matrix
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixInv
@{
*/
/**
* @brief Solve UT . X = A where UT is an upper triangular matrix
* @param[in] ut The upper triangular matrix
* @param[in] a The matrix a
* @param[out] dst The solution X of UT . X = A
* @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
*/
arm_status arm_mat_solve_upper_triangular_f64(
  const arm_matrix_instance_f64 * ut,
  const arm_matrix_instance_f64 * a,
  arm_matrix_instance_f64 * dst)
{
  arm_status status;                             /* outcome of the solve */

#ifdef ARM_MATH_MATRIX_CHECK

  /* UT must be square and its row count must match A's. */
  if ((ut->numRows != ut->numCols) ||
      (ut->numRows != a->numRows)   )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Back substitution, one column of X at a time:
       x[n-1] = a[n-1] / u[n-1][n-1]
       x[i]   = (a[i] - sum_{k>i} u[i][k] * x[k]) / u[i][i]
    */
    const int order  = dst->numRows;             /* dimension of UT     */
    const int nbCols = dst->numCols;             /* columns of A and X  */

    float64_t *pX  = dst->pData;
    float64_t *pUT = ut->pData;
    float64_t *pA  = a->pData;

    int col;
    for (col = 0; col < nbCols; col++)
    {
      float64_t *aCol = &pA[col];                /* column `col` of A   */
      int row;

      /* Rows are solved from the bottom up. */
      for (row = order - 1; row >= 0; row--)
      {
        float64_t acc = aCol[row * nbCols];
        float64_t *utRow = &pUT[order * row];    /* row `row` of UT     */
        int k;

        /* Remove contributions of the already-solved unknowns,
           scanned from the last row down to row+1 (same order as the
           reference implementation, so rounding is identical). */
        for (k = order - 1; k > row; k--)
        {
          acc -= utRow[k] * pX[nbCols * k + col];
        }

        /* A zero diagonal entry makes the system unsolvable. */
        if (utRow[row] == 0.0)
        {
          return (ARM_MATH_SINGULAR);
        }

        acc = acc / utRow[row];
        pX[row * nbCols + col] = acc;
      }
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
@} end of MatrixInv group
*/

View File

@@ -0,0 +1,215 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f16.c
* Description: Floating-point matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions_f16.h"
#if defined(ARM_FLOAT16_SUPPORTED)
/**
@ingroup groupMatrix
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  arm_status status;                             /* status of matrix subtraction */
  uint32_t  numSamples;                          /* total number of elements in the matrix */
  float16_t *pDataA, *pDataB, *pDataDst;
  f16x8_t vecA, vecB, vecDst;
  float16_t const *pSrcAVec;
  float16_t const *pSrcBVec;
  uint32_t  blkCnt;                              /* loop counters */

  pDataA = pSrcA->pData;
  pDataB = pSrcB->pData;
  pDataDst = pDst->pData;
  pSrcAVec = (float16_t const *) pDataA;
  pSrcBVec = (float16_t const *) pDataB;

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: A, B and the destination must
     all share the same dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /*
     * Total number of samples in the input matrix
     */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
    /* Main loop: 8 half-precision elements per MVE vector iteration. */
    blkCnt = numSamples >> 3;
    while (blkCnt > 0U)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      /* sub and then store the results in the destination buffer. */
      vecA = vld1q(pSrcAVec);
      pSrcAVec += 8;
      vecB = vld1q(pSrcBVec);
      pSrcBVec += 8;
      vecDst = vsubq(vecA, vecB);
      vst1q(pDataDst, vecDst);
      pDataDst += 8;
      /*
       * Decrement the blockSize loop counter
       */
      blkCnt--;
    }
    /*
     * tail: remaining 1..7 elements, handled with a predicated
     * subtract/store so only the valid lanes are written
     * (will be merged thru tail predication)
     */
    blkCnt = numSamples & 7;
    if (blkCnt > 0U)
    {
      mve_pred16_t p0 = vctp16q(blkCnt);
      vecA = vld1q(pSrcAVec);
      vecB = vld1q(pSrcBVec);
      /* Inactive lanes keep vecDst's value, but vstrhq_p only stores
         the lanes enabled by p0, so they never reach memory. */
      vecDst = vsubq_m(vecDst, vecA, vecB, p0);
      vstrhq_p(pDataDst, vecDst, p0);
    }
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#else
arm_status arm_mat_sub_f16(
  const arm_matrix_instance_f16 * pSrcA,
  const arm_matrix_instance_f16 * pSrcB,
  arm_matrix_instance_f16 * pDst)
{
  float16_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
  float16_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
  float16_t *pOut = pDst->pData;                 /* output data matrix pointer */

  uint32_t numSamples;                           /* total number of elements in the matrix */
  uint32_t blkCnt;                               /* loop counters */
  arm_status status;                             /* status of matrix subtraction */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition: A, B and the destination must
     all share the same dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Total number of samples in input matrix; the subtraction is
       element-wise, so the matrices are traversed as flat arrays. */
    numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
      /* C(m,n) = A(m,n) - B(m,n) */

      /* Subtract and store result in destination buffer.
         Arithmetic is done in _Float16 to match storage precision. */
      *pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
      *pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
      *pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);
      *pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);

      /* Decrement loop counter */
      blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs (numSamples % 4) */
    blkCnt = numSamples % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (blkCnt > 0U)
    {
      /* C(m,n) = A(m,n) - B(m,n) */

      /* Subtract and store result in destination buffer. */
      *pOut++ = (_Float16)(*pInA++) - (_Float16)(*pInB++);

      /* Decrement loop counter */
      blkCnt--;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of MatrixSub group
*/
#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_sub_f32.c
* Description: Floating-point matrix subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,37 +26,112 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @defgroup MatrixSub Matrix Subtraction
*
* Subtract two matrices.
* \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
*
* The functions check to make sure that
* <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
* number of rows and columns.
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
* @addtogroup MatrixSub
* @{
@addtogroup MatrixSub
@{
*/
/**
* @brief Floating-point matrix subtraction
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
arm_matrix_instance_f32 * pDst)
{
arm_status status; /* status of matrix subtraction */
uint32_t numSamples; /* total number of elements in the matrix */
float32_t *pDataA, *pDataB, *pDataDst;
f32x4_t vecA, vecB, vecDst;
float32_t const *pSrcAVec;
float32_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (float32_t const *) pDataA;
pSrcBVec = (float32_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
blkCnt = numSamples >> 2;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec);
pSrcAVec += 4;
vecB = vld1q(pSrcBVec);
pSrcBVec += 4;
vecDst = vsubq(vecA, vecB);
vst1q(pDataDst, vecDst);
pDataDst += 4;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/*
* tail
* (will be merged thru tail predication)
*/
blkCnt = numSamples & 3;
if (blkCnt > 0U)
{
mve_pred16_t p0 = vctp32q(blkCnt);
vecA = vld1q(pSrcAVec);
vecB = vld1q(pSrcBVec);
vecDst = vsubq_m(vecDst, vecA, vecB, p0);
vstrwq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
#if defined(ARM_MATH_NEON)
arm_status arm_mat_sub_f32(
const arm_matrix_instance_f32 * pSrcA,
const arm_matrix_instance_f32 * pSrcB,
@@ -66,11 +141,6 @@ arm_status arm_mat_sub_f32(
float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
float32_t *pOut = pDst->pData; /* output data matrix pointer */
#if defined (ARM_MATH_DSP)
float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
#endif // #if defined (ARM_MATH_DSP)
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
@@ -88,71 +158,28 @@ arm_status arm_mat_sub_f32(
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
float32x4_t vec1;
float32x4_t vec2;
float32x4_t res;
/* Total number of samples in the input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Loop Unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
/* Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
/* Read values from source A */
inA1 = pIn1[0];
vec1 = vld1q_f32(pIn1);
vec2 = vld1q_f32(pIn2);
res = vsubq_f32(vec1, vec2);
vst1q_f32(pOut, res);
/* Read values from source B */
inB1 = pIn2[0];
/* Read values from source A */
inA2 = pIn1[1];
/* out = sourceA - sourceB */
out1 = inA1 - inB1;
/* Read values from source B */
inB2 = pIn2[1];
/* Read values from source A */
inA1 = pIn1[2];
/* out = sourceA - sourceB */
out2 = inA2 - inB2;
/* Read values from source B */
inB1 = pIn2[2];
/* Store result in destination */
pOut[0] = out1;
pOut[1] = out2;
/* Read values from source A */
inA2 = pIn1[3];
/* Read values from source B */
inB2 = pIn2[3];
/* out = sourceA - sourceB */
out1 = inA1 - inB1;
/* out = sourceA - sourceB */
out2 = inA2 - inB2;
/* Store result in destination */
pOut[2] = out1;
/* Store result in destination */
pOut[3] = out2;
/* update pointers to process next sampels */
/* Update pointers to process next samples */
pIn1 += 4U;
pIn2 += 4U;
pOut += 4U;
@@ -165,14 +192,6 @@ arm_status arm_mat_sub_f32(
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
#else
/* Run the below code for Cortex-M0 */
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_DSP) */
while (blkCnt > 0U)
{
@@ -191,7 +210,89 @@ arm_status arm_mat_sub_f32(
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
        arm_matrix_instance_f32 * pDst)
{
  /* Element-wise floating-point matrix subtraction: pDst = pSrcA - pSrcB.
     Scalar fallback used when neither MVE nor NEON code paths are compiled in. */
  float32_t *srcA = pSrcA->pData;                /* walks matrix A */
  float32_t *srcB = pSrcB->pData;                /* walks matrix B */
  float32_t *dst  = pDst->pData;                 /* walks the output matrix */
  uint32_t   total;                              /* total element count */
  uint32_t   cnt;                                /* loop counter */
  arm_status status;                             /* execution status */

#ifdef ARM_MATH_MATRIX_CHECK

  /* All three matrices must share identical dimensions. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols))
  {
    /* Dimension mismatch: report size error. */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* The matrix is processed as one flat buffer of numRows*numCols samples. */
    total = (uint32_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Unrolled section: four differences per iteration. */
    for (cnt = total >> 2U; cnt > 0U; cnt--)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      *dst++ = (*srcA++) - (*srcB++);
      *dst++ = (*srcA++) - (*srcB++);
      *dst++ = (*srcA++) - (*srcB++);
      *dst++ = (*srcA++) - (*srcB++);
    }

    /* 0..3 elements remain after the unrolled section. */
    cnt = total & 0x3U;

#else

    /* No unrolling: handle every element in the cleanup loop below. */
    cnt = total;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    for (; cnt > 0U; cnt--)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      *dst++ = (*srcA++) - (*srcB++);
    }

    /* All samples written successfully. */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
#endif /* #if defined(ARM_MATH_NEON) */
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
* @} end of MatrixSub group
@} end of MatrixSub group
*/

View File

@@ -0,0 +1,143 @@
/* ----------------------------------------------------------------------
* Project: CMSIS DSP Library
* Title: arm_mat_sub_f64.c
* Description: Floating-point matrix subtraction
*
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dsp/matrix_functions.h"
/**
@ingroup groupMatrix
*/
/**
@defgroup MatrixSub Matrix Subtraction
Subtract two matrices.
\image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
The functions check to make sure that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns.
*/
/**
@addtogroup MatrixSub
@{
*/
/**
@brief Floating-point matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
*/
arm_status arm_mat_sub_f64(
  const arm_matrix_instance_f64 * pSrcA,
  const arm_matrix_instance_f64 * pSrcB,
        arm_matrix_instance_f64 * pDst)
{
  /* Double-precision element-wise matrix subtraction: pDst = pSrcA - pSrcB. */
  float64_t *a   = pSrcA->pData;   /* element pointer into A */
  float64_t *b   = pSrcB->pData;   /* element pointer into B */
  float64_t *out = pDst->pData;    /* element pointer into the destination */
  uint64_t   total;                /* number of matrix elements */
  uint64_t   remaining;            /* elements still to process */
  arm_status status;               /* execution status */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Reject the operation unless A, B and the destination agree in size. */
  if ((pSrcA->numRows != pSrcB->numRows) ||
      (pSrcA->numCols != pSrcB->numCols) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcA->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* The matrices are traversed as a single flat buffer. */
    total = (uint64_t) pSrcA->numRows * pSrcA->numCols;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Unrolled section: four differences per pass. */
    remaining = total >> 2U;
    while (remaining > 0U)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      out[0] = a[0] - b[0];
      out[1] = a[1] - b[1];
      out[2] = a[2] - b[2];
      out[3] = a[3] - b[3];

      a   += 4;
      b   += 4;
      out += 4;
      remaining--;
    }

    /* Tail: 0..3 leftover elements. */
    remaining = total & 0x3U;

#else

    /* Without unrolling the cleanup loop covers every element. */
    remaining = total;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (remaining > 0U)
    {
      /* C(m,n) = A(m,n) - B(m,n) */
      *out++ = (*a++) - (*b++);
      remaining--;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
/**
@} end of MatrixSub group
*/

View File

@@ -3,13 +3,13 @@
* Title: arm_mat_sub_q15.c
* Description: Q15 Matrix subtraction
*
* $Date: 27. January 2017
* $Revision: V.1.5.1
* $Date: 23 April 2021
* $Revision: V1.9.0
*
* Target Processor: Cortex-M cores
* Target Processor: Cortex-M and Cortex-A cores
* -------------------------------------------------------------------- */
/*
* Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
* Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -26,115 +26,185 @@
* limitations under the License.
*/
#include "arm_math.h"
#include "dsp/matrix_functions.h"
/**
* @ingroup groupMatrix
@ingroup groupMatrix
*/
/**
* @addtogroup MatrixSub
* @{
@addtogroup MatrixSub
@{
*/
/**
* @brief Q15 matrix subtraction.
* @param[in] *pSrcA points to the first input matrix structure
* @param[in] *pSrcB points to the second input matrix structure
* @param[out] *pDst points to output matrix structure
* @return The function returns either
* <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
*
* <b>Scaling and Overflow Behavior:</b>
* \par
* The function uses saturating arithmetic.
* Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
@brief Q15 matrix subtraction.
@param[in] pSrcA points to the first input matrix structure
@param[in] pSrcB points to the second input matrix structure
@param[out] pDst points to output matrix structure
@return execution status
- \ref ARM_MATH_SUCCESS : Operation successful
- \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
@par Scaling and Overflow Behavior
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
uint32_t numSamples; /* total number of elements in the matrix */
q15_t *pDataA, *pDataB, *pDataDst;
q15x8_t vecA, vecB, vecDst;
q15_t const *pSrcAVec;
q15_t const *pSrcBVec;
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
pDataA = pSrcA->pData;
pDataB = pSrcB->pData;
pDataDst = pDst->pData;
pSrcAVec = (q15_t const *) pDataA;
pSrcBVec = (q15_t const *) pDataB;
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in the input matrix */
/*
* Total number of samples in the input matrix
*/
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_DSP)
/* Run the below code for Cortex-M4 and Cortex-M3 */
/* Apply loop unrolling */
blkCnt = numSamples >> 2U;
/* First part of the processing with loop unrolling. Compute 4 outputs at a time.
** a second loop below computes the remaining 1 to 3 samples. */
blkCnt = numSamples >> 3;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and then store the results in the destination buffer. */
*__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
*__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
/* Decrement the loop counter */
blkCnt--;
/* C(m,n) = A(m,n) + B(m,n) */
/* sub and then store the results in the destination buffer. */
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq(vecA, vecB);
vst1q(pDataDst, vecDst); pDataDst += 8;
/*
* Decrement the blockSize loop counter
*/
blkCnt--;
}
/* If the blockSize is not a multiple of 4, compute any remaining output samples here.
** No loop unrolling is used. */
blkCnt = numSamples % 0x4U;
while (blkCnt > 0U)
/*
* tail
*/
blkCnt = numSamples & 7;
if (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
/* Decrement the loop counter */
blkCnt--;
mve_pred16_t p0 = vctp16q(blkCnt);
vecA = vld1q(pSrcAVec); pSrcAVec += 8;
vecB = vld1q(pSrcBVec); pSrcBVec += 8;
vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
vstrhq_p(pDataDst, vecDst, p0);
}
status = ARM_MATH_SUCCESS;
}
/* Return to application */
return (status);
}
#else
arm_status arm_mat_sub_q15(
const arm_matrix_instance_q15 * pSrcA,
const arm_matrix_instance_q15 * pSrcB,
arm_matrix_instance_q15 * pDst)
{
q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
q15_t *pOut = pDst->pData; /* output data matrix pointer */
/* Run the below code for Cortex-M0 */
uint32_t numSamples; /* total number of elements in the matrix */
uint32_t blkCnt; /* loop counters */
arm_status status; /* status of matrix subtraction */
#ifdef ARM_MATH_MATRIX_CHECK
/* Check for matrix mismatch condition */
if ((pSrcA->numRows != pSrcB->numRows) ||
(pSrcA->numCols != pSrcB->numCols) ||
(pSrcA->numRows != pDst->numRows) ||
(pSrcA->numCols != pDst->numCols) )
{
/* Set status as ARM_MATH_SIZE_MISMATCH */
status = ARM_MATH_SIZE_MISMATCH;
}
else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
{
/* Total number of samples in input matrix */
numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
#if defined (ARM_MATH_LOOPUNROLL)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract, Saturate and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
/* Loop unrolling: Compute remaining outputs */
blkCnt = numSamples % 0x4U;
#else
/* Initialize blkCnt with number of samples */
blkCnt = numSamples;
#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
while (blkCnt > 0U)
{
/* C(m,n) = A(m,n) - B(m,n) */
/* Subtract and then store the results in the destination buffer. */
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
/* Decrement the loop counter */
/* Subtract and store result in destination buffer. */
#if defined (ARM_MATH_DSP)
*pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
#else
*pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
#endif
/* Decrement loop counter */
blkCnt--;
}
#endif /* #if defined (ARM_MATH_DSP) */
/* Set status as ARM_MATH_SUCCESS */
status = ARM_MATH_SUCCESS;
}
@@ -142,7 +212,8 @@ arm_status arm_mat_sub_q15(
/* Return to application */
return (status);
}
#endif /* defined(ARM_MATH_MVEI) */
/**
* @} end of MatrixSub group
@} end of MatrixSub group
*/

Some files were not shown because too many files have changed in this diff Show More