4 files changed, 9 insertions, 40 deletions
diff --git a/gr-filter/lib/qa_fir_filter_with_buffer.cc b/gr-filter/lib/qa_fir_filter_with_buffer.cc
index f8af9ea0b..eaa0359d0 100644
--- a/gr-filter/lib/qa_fir_filter_with_buffer.cc
+++ b/gr-filter/lib/qa_fir_filter_with_buffer.cc
@@ -39,8 +39,6 @@ namespace gr {
 #define MAX_DATA        (16383)
 #define	ERR_DELTA	(1e-5)
 
-#define	NELEM(x) (sizeof(x) / sizeof(x[0]))
-
     static float
     uniform() 
     {
@@ -58,8 +56,8 @@ namespace gr {
     random_complex(gr_complex *buf, unsigned n)
     {
       for(unsigned i = 0; i < n; i++) {
-	float re = rint(uniform () * MAX_DATA);
-	float im = rint(uniform () * MAX_DATA);
+	float re = rint(uniform() * MAX_DATA);
+	float im = rint(uniform() * MAX_DATA);
 	buf[i] = gr_complex(re, im);
       }
     }
@@ -263,7 +261,7 @@ namespace gr {
 	      new kernel::fir_filter_with_buffer_ccc(f1_taps);
 
 	    // zero the output, then do the filtering
-	    memset(actual_output, 0, sizeof(actual_output));
+	    memset(actual_output, 0, OUTPUT_LEN*sizeof(o_type));
 	    f1->filterNdec(actual_output, input, ol/decimate, decimate);
 
 	    // check results
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
index 80c55e75f..05732b1ea 100644
--- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
@@ -174,12 +174,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
     printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
     printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
     printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
-    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
+    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
     
     phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
     inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
-    zeros = _mm256_set1_ps(0.0);
-    negated = _mm256_set1_ps(-1.0);
     const unsigned int fourthPoints = num_points / 4;
 
     
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
index cb2ac4c67..166a883a7 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
@@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
   unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
   unsigned int isodd = (num_bytes >> 3) &1;
 
-
-
   float sum0[2] = {0,0};
   float sum1[2] = {0,0};
   unsigned int i = 0;
 
-
   for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-
     sum0[0] += in[0] * tp[0] - in[1] * tp[1];
     sum0[1] += in[0] * tp[1] + in[1] * tp[0];
     sum1[0] += in[2] * tp[2] - in[3] * tp[3];
     sum1[1] += in[2] * tp[3] + in[3] * tp[2];
 
-
     in += 4;
     tp += 4;
-
   }
 
-
   res[0] = sum0[0] + sum1[0];
   res[1] = sum0[1] + sum1[1];
 
-
-
   for(i = 0; i < isodd; ++i) {
-
-
     *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
   }
-
 }
 
 #endif /*LV_HAVE_GENERIC*/
@@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const
      );
 
 
-  int getem = num_bytes % 16;
-
-
-  for(; getem > 0; getem -= 8) {
-
-
+  if(((num_bytes >> 3) & 1)) {
     *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-
   }
 
   return;
@@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
 
   dotProduct += ( dotProductVector[0] + dotProductVector[1] );
 
-  if((num_bytes >> 2) != 0) {
+  if(((num_bytes >> 3) & 1) != 0) {
     dotProduct += (*a) * (*b);
   }
 
@@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
 #include <smmintrin.h>
 
 static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes);
-  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
-   /*
+
     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
     float *p_input, *p_taps;
     __m64 *p_result;
@@ -442,12 +420,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
 
     }
 
-
-
-
     real1 = _mm_xor_ps(real1, (__m128)neg);
 
-
     im0 = _mm_add_ps(im0, im1);
     real0 = _mm_add_ps(real0, real1);
 
@@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
 
     *result += input[i] * taps[i];
     }
-  */
 }
 
 #endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 813e62217..d1eb1cacb 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -35,8 +35,8 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
 VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);