2 files changed, 160 insertions, 75 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 8753ff615..0543227b0 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -8,14 +8,17 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr=  taps;
   unsigned int number = 0;
 
-  for(number = 0; number < num_points; number++){
+  for(number = 0; number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -28,10 +31,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
 #ifdef LV_HAVE_SSE
 
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_4x_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -67,8 +70,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
     dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
     dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
 
-    aPtr += 4*4;
-    bPtr += 4*4;
+    aPtr += 16;
+    bPtr += 16;
   }
 
   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
@@ -84,8 +87,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
@@ -102,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
 
 #include <pmmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
 
-    cVal = _mm_mul_ps(aVal, bVal);
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
 
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
 
-    aPtr += 4;
-    bPtr += 4;
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -149,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
 
 #include <smmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+  const unsigned int sixteenthPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -196,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
+  number = sixteenthPoints * 4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 3b7284b57..dfafe2239 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -1,20 +1,24 @@
 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 
+#include <volk/volk_common.h>
 #include<stdio.h>
 
 
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr=  taps;
   unsigned int number = 0;
 
-  for(number = 0; number < num_points; number++){
+  for(number = 0; number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -27,43 +31,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
 #ifdef LV_HAVE_SSE
 
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_4x_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
-
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -77,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
 
 #include <pmmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
-
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -124,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
 
 #include <smmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+  const unsigned int sixteenthPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -141,15 +193,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
 
   for(;number < sixteenthPoints; number++){
 
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
 
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
 
     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
@@ -171,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
+  number = sixteenthPoints * 4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }