volk: dot_product for floats does 16 at a time.

This was done to give these kernels the same performance as float_dotprod from before, and it makes all flavors of the 32f dotprod work the same way. Because the kernels now expect the input to have 4x more samples than specified, QA for them is failing.
author: Tom Rondeau 2012-06-13 17:49:44 -0400
committer: Tom Rondeau 2012-06-13 17:49:44 -0400
commit: 7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d (patch)
tree: 596eea7cd72a83df1c5578bfc55d5296f580a1ff
parent: 1b93bb39731517e6d7bb8a78ea9569995e8a371a (diff)
download: gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.tar.gz
gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.tar.bz2
gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.zip
2 files changed, 160 insertions, 75 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 8753ff615..0543227b0 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -8,14 +8,17 @@
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr=  taps;
   unsigned int number = 0;
 
-  for(number = 0; number < num_points; number++){
+  for(number = 0; number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -28,10 +31,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
 #ifdef LV_HAVE_SSE
 
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_4x_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -67,8 +70,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
     dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
     dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
 
-    aPtr += 4*4;
-    bPtr += 4*4;
+    aPtr += 16;
+    bPtr += 16;
   }
 
   dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
@@ -84,8 +87,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
@@ -102,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
 
 #include <pmmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
 
-    cVal = _mm_mul_ps(aVal, bVal);
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
 
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
 
-    aPtr += 4;
-    bPtr += 4;
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -149,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
 
 #include <smmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+  const unsigned int sixteenthPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -196,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
+  number = sixteenthPoints * 4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 3b7284b57..dfafe2239 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -1,20 +1,24 @@
 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 
+#include <volk/volk_common.h>
 #include<stdio.h>
 
 
 #ifdef LV_HAVE_GENERIC
 
 
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr=  taps;
   unsigned int number = 0;
 
-  for(number = 0; number < num_points; number++){
+  for(number = 0; number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -27,43 +31,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
 #ifdef LV_HAVE_SSE
 
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_4x_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
-
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -77,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
 
 #include <pmmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int quarterPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-  __m128 dotProdVal = _mm_setzero_ps();
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
   for(;number < quarterPoints; number++){
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
-
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
-  for(;number < num_points; number++){
+  number = quarterPoints*4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
@@ -124,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
 
 #include <smmintrin.h>
 
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
   unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16;
+  const unsigned int sixteenthPoints = num_4x_points / 4;
 
   float dotProduct = 0;
   const float* aPtr = input;
@@ -141,15 +193,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
 
   for(;number < sixteenthPoints; number++){
 
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
 
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
 
     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
@@ -171,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = sixteenthPoints * 16;
-  for(;number < num_points; number++){
+  number = sixteenthPoints * 4;
+  for(;number < num_4x_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
+    dotProduct += ((*aPtr++) * (*bPtr++));
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
author	Tom Rondeau	2012-06-13 17:49:44 -0400
committer	Tom Rondeau	2012-06-13 17:49:44 -0400
commit	7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d (patch)
tree	596eea7cd72a83df1c5578bfc55d5296f580a1ff
parent	1b93bb39731517e6d7bb8a78ea9569995e8a371a (diff)
download	gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.tar.gz gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.tar.bz2 gnuradio-7f9f0fc96a3fbfe297b0a5cb18d922bb74fdc34d.zip