4 files changed, 137 insertions, 8 deletions
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 3ce21d304..8bfaa4f50 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -84,7 +84,7 @@ namespace gr {
 
       private:
 	unsigned int d_ntaps;
-	gr_complex  *d_taps;
+	float       *d_taps;
 	float      **d_aligned_taps;
 	gr_complex  *d_output;
 	int          d_align;
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 973be45d4..18568da9d 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -145,8 +145,14 @@ namespace gr {
       fir_filter_ccf::fir_filter_ccf(int decimation,
 				     const std::vector<float> &taps)
       {
+	d_align = volk_get_alignment(); // treated as a byte count: divided by sizeof() below
+	d_naligned = d_align / sizeof(gr_complex); // complex samples per alignment unit
+	// set_taps() sizes its per-offset tap copies from d_naligned, so it must be set first.
 	d_taps = NULL;
 	set_taps(taps);
+	// NOTE(review): decimation is not referenced anywhere in this constructor body.
+	// Make sure the output sample is always aligned, too.
+	d_output = fft::malloc_complex(1);
       }
       
       fir_filter_ccf::~fir_filter_ccf()
@@ -167,9 +173,17 @@ namespace gr {
 	}
 	
 	d_ntaps = (int)taps.size();
-	d_taps = fft::malloc_complex(d_ntaps);
+	d_taps = fft::malloc_float(d_ntaps);
 	for(unsigned int i = 0; i < d_ntaps; i++) {
-	  d_taps[d_ntaps-i-1] = gr_complex(taps[i],0);
+	  d_taps[d_ntaps-i-1] = taps[i];
+	}
+
+	// Make a set of taps at all possible arch alignments
+	d_aligned_taps = (float**)malloc(d_naligned*sizeof(float**));
+	for(int i = 0; i < d_naligned; i++) {
+	  d_aligned_taps[i] = fft::malloc_float(d_ntaps+d_naligned-1);
+	  memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+d_naligned-1));
+	  memcpy(&d_aligned_taps[i][i], d_taps, sizeof(float)*(d_ntaps));
 	}
       }
       
@@ -178,7 +192,7 @@ namespace gr {
       {
 	std::vector<float> t;
 	for(unsigned int i = 0; i < d_ntaps; i++)
-	  t.push_back(d_taps[d_ntaps-i-1].real());
+	  t.push_back(d_taps[d_ntaps-i-1]);
 	return t;
       }
 
@@ -191,9 +205,13 @@ namespace gr {
       gr_complex
       fir_filter_ccf::filter(const gr_complex input[])
       {
-	gr_complex output;
-	volk_32fc_x2_dot_prod_32fc_u(&output, input, d_taps, d_ntaps);
-	return output;
+	// Returns one output sample: the dot product of the input with the stored taps.
+	// Round input down to the alignment; size_t (unlike unsigned long on LLP64) holds a full address.
+	const gr_complex *ar = (gr_complex *)((size_t) input & ~(d_align-1));
+	unsigned al = input - ar; // samples from ar to input; selects the taps padded with al leading zeros
+	volk_32fc_32f_dot_prod_32fc_a(d_output, ar,
+				      d_aligned_taps[al], (d_ntaps+al));
+	return *d_output;
       }
       
       void
diff --git a/gr-filter/lib/fir_filter_XXX_impl.cc.t b/gr-filter/lib/fir_filter_XXX_impl.cc.t
index 18bec38be..8ea8151f8 100644
--- a/gr-filter/lib/fir_filter_XXX_impl.cc.t
+++ b/gr-filter/lib/fir_filter_XXX_impl.cc.t
@@ -40,7 +40,7 @@ namespace gr {
 
 
     @IMPL_NAME@::@IMPL_NAME@(int decimation, const std::vector<@TAP_TYPE@> &taps)
-      : gr_sync_decimator("fir_filter_fff",
+      : gr_sync_decimator("@BASE_NAME@",
 			  gr_make_io_signature(1, 1, sizeof(@I_TYPE@)),
 			  gr_make_io_signature(1, 1, sizeof(@O_TYPE@)),
 			  decimation)
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
new file mode 100644
index 000000000..109b787e8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+  // Dot product of a complex vector with a real tap vector:
+  //   *result = sum over k in [0, num_points) of input[k] * taps[k]
+  // input is walked as interleaved floats (re, im, re, im, ...).
+
+  const float* sample = (const float*)input;
+  float acc[2] = { 0, 0 };  // acc[0]: real sum, acc[1]: imaginary sum
+  unsigned int k;
+
+  for(k = 0; k < num_points; k++, sample += 2){
+    // A single real tap scales both halves of the k-th complex sample.
+    acc[0] += sample[0] * taps[k];
+    acc[1] += sample[1] * taps[k];
+  }
+
+  // Hand the accumulated float pair back as one complex value.
+  *result = *(lv_32fc_t*)(&acc[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
+  // SSE dot product of complex input with real taps: *result = sum(input[k] * taps[k]).
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 8; // despite the name: number of full 8-point blocks
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const float* aPtr = (float*)input; // input viewed as interleaved re,im floats
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 x0Val, x1Val, x2Val, x3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+  // Main loop: each pass consumes 8 complex samples (16 floats) and 8 taps.
+  for(;number < sixteenthPoints; number++){
+    // _mm_load_ps is an aligned load, so input and taps must both be 16-byte aligned.
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    // Duplicate each tap (t0,t0,t1,t1,...) so it lines up with a sample's re,im pair.
+    x0Val = _mm_load_ps(bPtr);
+    x1Val = _mm_load_ps(bPtr);
+    x2Val = _mm_load_ps(bPtr+4);
+    x3Val = _mm_load_ps(bPtr+4);
+    b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+    b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+    b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+    b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 8;
+  }
+  // Fold the four partial accumulators into one register; its lanes are re,im,re,im.
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  // Lanes 0 and 2 hold real partial sums, lanes 1 and 3 imaginary ones.
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+  // Scalar tail: the remaining num_points % 8 points, continuing from where the loop left aPtr/bPtr.
+  number = sixteenthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr++) * (*bPtr));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/