diff options
-rw-r--r-- | volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 39 |
1 files changed, 39 insertions, 0 deletions
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Multiplies the two input vectors element by element and stores the results in the third vector
  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be multiplied
  \param bVector One of the vectors to be multiplied
  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector

  \note The aligned AVX intrinsics (_mm256_load_ps / _mm256_store_ps) require
  32-byte aligned addresses. The _a16 suffix of this kernel suggests callers
  only guarantee 16-byte alignment (NOTE(review): confirm against the
  dispatcher), so the unaligned variants are used here; they accept any
  alignment and produce the same results.
*/
static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal, bVal, cVal;

  // Vectorized part: one 256-bit register holds 8 floats per iteration
  for(; number < eighthPoints; number++){

    // loadu/storeu: no 32-byte alignment requirement on the pointers
    aVal = _mm256_loadu_ps(aPtr);
    bVal = _mm256_loadu_ps(bPtr);

    cVal = _mm256_mul_ps(aVal, bVal);

    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  // Scalar tail: handle the remaining (num_points % 8) values one at a time
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */