1 files changed, 39 insertions, 0 deletions
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
index cef17f5a6..885941abf 100644
--- a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
@@ -43,6 +43,45 @@ static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float*
 }
 #endif /* LV_HAVE_SSE */
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors element-wise and stores the results in the third vector
+  \param cVector The vector where the results will be stored (no 32-byte alignment required)
+  \param aVector One of the vectors to be multiplied (no 32-byte alignment required)
+  \param bVector One of the vectors to be multiplied (no 32-byte alignment required)
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_a16_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr = bVector;
+
+    __m256 aVal, bVal, cVal;
+    for(;number < eighthPoints; number++){
+      // Unaligned load/store on purpose: the a16 kernel name suggests only 16-byte alignment is
+      // guaranteed, while _mm256_load_ps/_mm256_store_ps require 32-byte aligned addresses.
+      aVal = _mm256_loadu_ps(aPtr);
+      bVal = _mm256_loadu_ps(bPtr);
+      cVal = _mm256_mul_ps(aVal, bVal); // 8 float products per iteration
+      _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      bPtr += 8;
+      cPtr += 8;
+    }
+
+    // Scalar tail: handle the remaining num_points % 8 values one at a time.
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Multiplys the two input vectors and store their results in the third vector