1 files changed, 169 insertions, 0 deletions
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
new file mode 100644
index 000000000..1cd502257
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \note Single pass: stddev = sqrt(E[x^2] - mean^2). inputBuffer is read with
+  _mm_load_ps, so it must be 16-byte aligned.
+  \param stddev The calculated standard deviation (0 if num_points is 0)
+  \param mean The mean of the input buffer (0 if num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(16)));   // 16 bytes is all _mm_store_ps needs
+    float squareBuffer[4] __attribute__((aligned(16)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    // 16 floats per pass. Each _mm_dp_ps mask (0xF1/0xF2/0xF4/0xF8) puts its sum
+    // of squares in a different lane and zeroes the rest, so OR merges them.
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    newMean = meanBuffer[0] + meanBuffer[1] + meanBuffer[2] + meanBuffer[3];
+    returnValue = squareBuffer[0] + squareBuffer[1] + squareBuffer[2] + squareBuffer[3];
+
+    // Scalar tail for the (num_points % 16) values the vector loop did not reach
+    for(number = sixteenthPoints * 16; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    // Rounding can leave the variance slightly negative; clamp so sqrtf never yields NaN
+    if(returnValue < 0){ returnValue = 0; }
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \note Single pass: stddev = sqrt(E[x^2] - mean^2). inputBuffer is read with
+  _mm_load_ps, so it must be 16-byte aligned.
+  \param stddev The calculated standard deviation (0 if num_points is 0)
+  \param mean The mean of the input buffer (0 if num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(16)));   // 16 bytes is all _mm_store_ps needs
+    float squareBuffer[4] __attribute__((aligned(16)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    // Four floats per pass: one lane-wise running sum of x and one of x^2
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // aVal = x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal); // squareAccumulator += x^2
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    // Fold the four lanes of each accumulator into a scalar
+    newMean = meanBuffer[0] + meanBuffer[1] + meanBuffer[2] + meanBuffer[3];
+    returnValue = squareBuffer[0] + squareBuffer[1] + squareBuffer[2] + squareBuffer[3];
+
+    // Scalar tail for the (num_points % 4) values the vector loop did not reach
+    for(number = quarterPoints * 4; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    // Rounding can leave the variance slightly negative; clamp so sqrtf never yields NaN
+    if(returnValue < 0){ returnValue = 0; }
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \note Single pass: stddev = sqrt(E[x^2] - mean^2).
+  \param stddev The calculated standard deviation (0 if num_points is 0)
+  \param mean The mean of the input buffer (0 if num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number;
+    for(number = 0; number < num_points; number++){
+      returnValue += inputBuffer[number] * inputBuffer[number];  // running sum of x^2
+      newMean += inputBuffer[number];                            // running sum of x
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    if(returnValue < 0){ returnValue = 0; }  // rounding can push the variance below 0; avoid NaN
+    returnValue = sqrtf(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */