filter: adding scc and fsf versions of filter with associated new Volk kernels.

These routines work and pass QA. They could use some performance work. The fsf version is just slightly slower than before; the scc version is more noticeably slower. Both could probably benefit from using SSE2 intrinsics to handle the shorts.
author: Tom Rondeau 2012-06-15 08:43:20 -0400
committer: Tom Rondeau 2012-06-15 08:43:20 -0400
commit: 5585c71229cfa7886e0bd090828cd1f5104f6b27 (patch)
tree: c80bcc8821fb10a44c073ce1fa2a0f4816027bef
parent: a74286a2aa7fcddb52c165ba2c17cb2f55b5b592 (diff)
download: gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.tar.gz
gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.tar.bz2
gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.zip
8 files changed, 617 insertions, 2 deletions
diff --git a/gr-filter/include/filter/CMakeLists.txt b/gr-filter/include/filter/CMakeLists.txt
index 5b209873c..2620d3f54 100644
--- a/gr-filter/include/filter/CMakeLists.txt
+++ b/gr-filter/include/filter/CMakeLists.txt
@@ -64,7 +64,7 @@ endmacro(expand_h)
 ########################################################################
 # Invoke macro to generate various sources
 #######################################################################
-expand_h(fir_filter_XXX fff ccf ccc)
+expand_h(fir_filter_XXX fff ccf ccc scc fsf)
 
 add_custom_target(filter_generated_includes DEPENDS
     ${generated_includes}
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 8bfaa4f50..1fb3afb4d 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -122,6 +122,68 @@ namespace gr {
 	int          d_naligned;
       };
 
+      /**************************************************************/
+      
+      class FILTER_API fir_filter_scc
+      {
+      public:
+	fir_filter_scc(int decimation,
+		       const std::vector<gr_complex> &taps);
+	~fir_filter_scc();
+	// Tap management: set_taps() replaces the stored coefficients.
+	void set_taps(const std::vector<gr_complex> &taps);
+	std::vector<gr_complex> taps() const;
+	unsigned int ntaps() const;
+	// Filtering: short samples in, gr_complex samples out.
+	gr_complex filter(const short input[]);
+	void filterN(gr_complex output[],
+		     const short input[],
+		     unsigned long n);
+	void filterNdec(gr_complex output[],
+			const short input[],
+			unsigned long n,
+			unsigned int decimate);
+	// NOTE(review): owns raw buffers but declares no copy control - confirm instances are never copied.
+      private:
+	unsigned int d_ntaps;         // number of taps held in d_taps
+	gr_complex  *d_taps;          // taps, stored in reversed order
+	gr_complex **d_aligned_taps;  // d_naligned zero-padded, shifted copies of d_taps
+	gr_complex  *d_output;        // one-sample result buffer for the Volk kernel
+	int          d_align;         // value returned by volk_get_alignment()
+	int          d_naligned;      // d_align / sizeof(short)
+      };
+
+      /**************************************************************/
+      
+      class FILTER_API fir_filter_fsf
+      {
+      public:
+	fir_filter_fsf(int decimation,
+		       const std::vector<float> &taps);
+	~fir_filter_fsf();
+	// Tap management: set_taps() replaces the stored coefficients.
+	void set_taps(const std::vector<float> &taps);
+	std::vector<float> taps() const;
+	unsigned int ntaps() const;
+	// Filtering: float samples in, short samples out.
+	short filter(const float input[]);
+	void filterN(short output[],
+		     const float input[],
+		     unsigned long n);
+	void filterNdec(short output[],
+			const float input[],
+			unsigned long n,
+			unsigned int decimate);
+	// NOTE(review): owns raw buffers but declares no copy control - confirm instances are never copied.
+      private:
+	unsigned int d_ntaps;         // number of taps held in d_taps
+	float       *d_taps;          // taps, stored in reversed order
+	float      **d_aligned_taps;  // d_naligned zero-padded, shifted copies of d_taps
+	short       *d_output;        // one-sample result buffer for the Volk kernel
+	int          d_align;         // value returned by volk_get_alignment()
+	int          d_naligned;      // d_align / sizeof(float)
+      };
+
     } /* namespace kernel */
   } /* namespace filter */
 } /* namespace gr */
diff --git a/gr-filter/lib/CMakeLists.txt b/gr-filter/lib/CMakeLists.txt
index b51a23bab..f5dbd1bb3 100644
--- a/gr-filter/lib/CMakeLists.txt
+++ b/gr-filter/lib/CMakeLists.txt
@@ -80,7 +80,7 @@ endmacro(expand_cc)
 ########################################################################
 # Invoke macro to generate various sources
 ########################################################################
-expand_cc(fir_filter_XXX_impl         fff ccf ccc)
+expand_cc(fir_filter_XXX_impl         fff ccf ccc scc fsf)
 
 
 ########################################################################
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 18568da9d..be8017400 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -349,6 +349,238 @@ namespace gr {
 	}
       }
       
+      /**************************************************************/
+
+      fir_filter_scc::fir_filter_scc(int decimation,
+				     const std::vector<gr_complex> &taps)
+      {
+	d_align = volk_get_alignment();
+	d_naligned = d_align / sizeof(short);  // short samples per alignment unit
+	// d_taps == NULL makes set_taps() skip its free-old-buffers branch.
+	d_taps = NULL;
+	set_taps(taps);  // note: 'decimation' is not used in this constructor
+
+	// One-sample buffer that filter() passes to the Volk kernel as its result.
+	d_output = fft::malloc_complex(1);
+      }
+      
+      fir_filter_scc::~fir_filter_scc()
+      {
+	// Release the reversed tap array.
+	if(d_taps != NULL) {
+	  fft::free(d_taps);
+	  d_taps = NULL;
+	}
+
+	// Release every aligned tap copy (each came from fft::malloc_complex).
+	for(int i = 0; i < d_naligned; i++) {
+	  fft::free(d_aligned_taps[i]);
+	}
+	::free(d_aligned_taps);  // pointer table came from malloc() in set_taps()
+
+	// Release the one-sample output buffer.
+	fft::free(d_output);
+      }
+      
+      void
+      fir_filter_scc::set_taps(const std::vector<gr_complex> &taps)
+      {
+	// Drop the buffers left over from a previous call, if any.
+	if(d_taps != NULL) {
+	  fft::free(d_taps);
+	  d_taps = NULL;
+
+	  for(int i = 0; i < d_naligned; i++) {
+	    fft::free(d_aligned_taps[i]);
+	  }
+	  ::free(d_aligned_taps);  // pairs with the malloc() below
+	}
+	// Store the taps in reversed order (taps[0] ends up last).
+	d_ntaps = (unsigned int)taps.size();
+	d_taps = fft::malloc_complex(d_ntaps);
+	for(unsigned int i = 0; i < d_ntaps; i++) {
+	  d_taps[d_ntaps-i-1] = taps[i];
+	}
+
+	// One zero-padded copy per possible input offset: copy i carries i
+	// leading zeros so filter() can start reading at an aligned address.
+	d_aligned_taps = (gr_complex**)malloc(d_naligned*sizeof(gr_complex*));
+	for(int i = 0; i < d_naligned; i++) {
+	  d_aligned_taps[i] = fft::malloc_complex(d_ntaps+d_naligned-1);
+	  memset(d_aligned_taps[i], 0, sizeof(gr_complex)*(d_ntaps+d_naligned-1));
+	  memcpy(&d_aligned_taps[i][i], d_taps, sizeof(gr_complex)*(d_ntaps));
+	}
+      }
+      
+      std::vector<gr_complex>
+      fir_filter_scc::taps() const
+      {
+	std::vector<gr_complex> ordered;  // d_taps is stored reversed; undo that
+	for(unsigned int k = d_ntaps; k > 0; k--)
+	  ordered.push_back(d_taps[k-1]);
+	return ordered;
+      }
+
+      unsigned int
+      fir_filter_scc::ntaps() const
+      {
+	return d_ntaps;  // assigned in set_taps() from taps.size()
+      }
+      
+      gr_complex
+      fir_filter_scc::filter(const short input[])
+      {
+	// Round the address down to the alignment boundary. size_t (instead of
+	// unsigned long) keeps every pointer bit where long is narrower.
+	const short *ar = (const short *)((size_t) input & ~(size_t)(d_align-1));
+	unsigned al = input - ar;  // shorts skipped; selects the tap copy padded by al
+	volk_16i_32fc_dot_prod_32fc_a(d_output, ar,
+				      d_aligned_taps[al],
+				      (d_ntaps+al));
+	return *d_output;  // kernel wrote the single result sample here
+      }
+      
+      void
+      fir_filter_scc::filterN(gr_complex output[],
+			      const short input[],
+			      unsigned long n)
+      {
+	// Slide the tap window forward one input sample per output sample.
+	for(unsigned long k = 0; k != n; ++k) { output[k] = filter(input + k); }
+      }
+      
+      
+      void
+      fir_filter_scc::filterNdec(gr_complex output[],
+				 const short input[],
+				 unsigned long n,
+				 unsigned int decimate)
+      {
+	// Output k is computed from the input window starting at k*decimate.
+	for(unsigned long k = 0; k < n; k++) {
+	  const unsigned long start = k * decimate;
+	  output[k] = filter(&input[start]);
+	}
+      }
+
+      /**************************************************************/
+
+      fir_filter_fsf::fir_filter_fsf(int decimation,
+				     const std::vector<float> &taps)
+      {
+	d_align = volk_get_alignment();
+	d_naligned = d_align / sizeof(float);  // float samples per alignment unit
+	// d_taps == NULL makes set_taps() skip its free-old-buffers branch.
+	d_taps = NULL;
+	set_taps(taps);  // note: 'decimation' is not used in this constructor
+
+	// One-sample buffer for the kernel result; float-sized storage used as a short.
+	d_output = (short*)fft::malloc_float(1);
+      }
+      
+      fir_filter_fsf::~fir_filter_fsf()
+      {
+	// Release the reversed tap array.
+	if(d_taps != NULL) {
+	  fft::free(d_taps);
+	  d_taps = NULL;
+	}
+
+	// Release every aligned tap copy (each came from fft::malloc_float).
+	for(int i = 0; i < d_naligned; i++) {
+	  fft::free(d_aligned_taps[i]);
+	}
+	::free(d_aligned_taps);  // pointer table came from malloc() in set_taps()
+
+	// Release the one-sample output buffer.
+	fft::free(d_output);
+      }
+      
+      void
+      fir_filter_fsf::set_taps(const std::vector<float> &taps)
+      {
+	// Drop the buffers left over from a previous call, if any.
+	if(d_taps != NULL) {
+	  fft::free(d_taps);
+	  d_taps = NULL;
+
+	  for(int i = 0; i < d_naligned; i++) {
+	    fft::free(d_aligned_taps[i]);
+	  }
+	  ::free(d_aligned_taps);  // pairs with the malloc() below
+	}
+	// Store the taps in reversed order (taps[0] ends up last).
+	d_ntaps = (unsigned int)taps.size();
+	d_taps = fft::malloc_float(d_ntaps);
+	for(unsigned int i = 0; i < d_ntaps; i++) {
+	  d_taps[d_ntaps-i-1] = taps[i];
+	}
+	// One zero-padded copy per possible input offset: copy i carries i
+	// leading zeros so filter() can start reading at an aligned address.
+	d_aligned_taps = (float**)malloc(d_naligned*sizeof(float*));
+	for(int i = 0; i < d_naligned; i++) {
+	  d_aligned_taps[i] = fft::malloc_float(d_ntaps+d_naligned-1);
+	  memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+d_naligned-1));
+	  memcpy(&d_aligned_taps[i][i], d_taps, sizeof(float)*(d_ntaps));
+	}
+      }
+      
+      std::vector<float>
+      fir_filter_fsf::taps() const
+      {
+	std::vector<float> ordered;  // d_taps is stored reversed; undo that
+	for(unsigned int k = d_ntaps; k > 0; k--)
+	  ordered.push_back(d_taps[k-1]);
+	return ordered;
+      }
+
+      unsigned int
+      fir_filter_fsf::ntaps() const
+      {
+	return d_ntaps;  // assigned in set_taps() from taps.size()
+      }
+      
+      short
+      fir_filter_fsf::filter(const float input[])
+      {
+	// Round the address down to the alignment boundary. size_t (instead of
+	// unsigned long) keeps every pointer bit where long is narrower.
+	const float *ar = (const float *)((size_t) input & ~(size_t)(d_align-1));
+	unsigned al = input - ar;  // floats skipped; selects the tap copy padded by al
+
+	// The al leading zeros in d_aligned_taps[al] cancel the extra samples
+	// read before 'input', so this matches the plain scalar form:
+	//   sum over i < d_ntaps of d_taps[i] * input[i], converted to short.
+	volk_32f_x2_dot_prod_16i_a(d_output, ar,
+				   d_aligned_taps[al],
+				   (d_ntaps+al));
+
+	// The kernel wrote the single result sample into d_output.
+	return *d_output;
+      }
+      
+      void
+      fir_filter_fsf::filterN(short output[],
+			      const float input[],
+			      unsigned long n)
+      {
+	// Slide the tap window forward one input sample per output sample.
+	for(unsigned long k = 0; k != n; ++k) { output[k] = filter(input + k); }
+      }
+
+      void
+      fir_filter_fsf::filterNdec(short output[],
+				 const float input[],
+				 unsigned long n,
+				 unsigned int decimate)
+      {
+	// Output k is computed from the input window starting at k*decimate.
+	for(unsigned long k = 0; k < n; k++) {
+	  const unsigned long start = k * decimate;
+	  output[k] = filter(&input[start]);
+	}
+      }
+
     } /* namespace kernel */
   } /* namespace filter */
 } /* namespace gr */
diff --git a/gr-filter/python/qa_fir_filter.py b/gr-filter/python/qa_fir_filter.py
index ac20286cc..2a61498a2 100755
--- a/gr-filter/python/qa_fir_filter.py
+++ b/gr-filter/python/qa_fir_filter.py
@@ -218,6 +218,101 @@ class test_filter(gr_unittest.TestCase):
         self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
 
 
+    def test_fir_filter_scc_001(self):
+        src_data = 40*[1, 2, 3, 4]  # 160 short samples
+        expected_data = ((0.5+1j), (1.5+3j), (3+6j), (5+10j), (5.5+11j),
+                         (6.5+13j), (8+16j), (10+20j), (10.5+21j), (11.5+23j),
+                         (13+26j), (15+30j), (15.5+31j), (16.5+33j), (18+36j),
+                         (20+40j), (20.5+41j), (21.5+43j), (23+46j), (25+50j),
+                         (25.5+51j), (26.5+53j), (28+56j), (30+60j), (30.5+61j),
+                         (31.5+63j), (33+66j), (35+70j), (35.5+71j), (36.5+73j),
+                         (38+76j), (40+80j), (40.5+81j), (41.5+83j), (43+86j),
+                         (45+90j), (45.5+91j), (46.5+93j), (48+96j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j))
+        src = gr.vector_source_s(src_data)
+        op  = filter.fir_filter_scc(1, 20*[0.5+1j, 0.5+1j])  # first argument 1, 40 equal complex taps
+        dst = gr.vector_sink_c()
+        self.tb.connect(src, op, dst)
+        self.tb.run()
+        result_data = dst.data()
+        self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)  # compare to 5 places
+
+
+    def test_fir_filter_scc_002(self):
+        src_data = 40*[1, 2, 3, 4]  # 160 short samples
+        expected_data = ((0.5+1j), (5.5+11j), (10.5+21j), (15.5+31j), (20.5+41j),
+                         (25.5+51j), (30.5+61j), (35.5+71j), (40.5+81j), (45.5+91j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+                         (50+100j), (50+100j), (50+100j), (50+100j), (50+100j))
+        src = gr.vector_source_s(src_data)
+        op  = filter.fir_filter_scc(4, 20*[0.5+1j, 0.5+1j])  # same taps as _001, first argument 4
+        dst = gr.vector_sink_c()
+        self.tb.connect(src, op, dst)
+        self.tb.run()
+        result_data = dst.data()
+        self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)  # compare to 5 places
+
+    def test_fir_filter_fsf_001(self):
+        src_data = 40*[1, 2, 3, 4]  # 160 float samples
+        expected_data =(0, 1, 3, 5, 5, 6, 8, 10, 10, 11, 13, 15, 15, 16, 18, 20, 20,
+                        21, 23, 25, 25, 26, 28, 30, 30, 31, 33, 35, 35, 36, 38, 40, 40,
+                        41, 43, 45, 45, 46, 48, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50)
+        src = gr.vector_source_f(src_data)
+        op  = filter.fir_filter_fsf(1, 20*[0.5, 0.5])  # first argument 1, 40 equal float taps
+        dst = gr.vector_sink_s()
+        self.tb.connect(src, op, dst)
+        self.tb.run()
+        result_data = dst.data()
+        self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)  # NOTE(review): data are shorts; an exact comparison may be clearer
+
+
+    def test_fir_filter_fsf_002(self):
+        src_data = 40*[1, 2, 3, 4]  # 160 float samples
+        expected_data = (0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 50, 50, 50, 50,
+                         50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+                         50, 50, 50, 50, 50, 50, 50, 50, 50, 50)
+        src = gr.vector_source_f(src_data)
+        op  = filter.fir_filter_fsf(4, 20*[0.5, 0.5])  # same taps as _001, first argument 4
+        dst = gr.vector_sink_s()
+        self.tb.connect(src, op, dst)
+        self.tb.run()
+        result_data = dst.data()
+        self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)  # NOTE(review): data are shorts; an exact comparison may be clearer
+
 if __name__ == '__main__':
     gr_unittest.run(test_filter, "test_filter.xml")
 
diff --git a/gr-filter/swig/filter_swig.i b/gr-filter/swig/filter_swig.i
index cc15b5722..c9de3fb9a 100644
--- a/gr-filter/swig/filter_swig.i
+++ b/gr-filter/swig/filter_swig.i
@@ -36,6 +36,8 @@
 #include "filter/fir_filter_fff.h"
 #include "filter/fir_filter_ccf.h"
 #include "filter/fir_filter_ccc.h"
+#include "filter/fir_filter_scc.h"
+#include "filter/fir_filter_fsf.h"
 #include "filter/fft_filter_ccc.h"
 #include "filter/fft_filter_fff.h"
 #include "filter/hilbert_fc.h"
@@ -50,6 +52,8 @@
 %include "filter/fir_filter_fff.h"
 %include "filter/fir_filter_ccf.h"
 %include "filter/fir_filter_ccc.h"
+%include "filter/fir_filter_scc.h"
+%include "filter/fir_filter_fsf.h"
 %include "filter/fft_filter_ccc.h"
 %include "filter/fft_filter_fff.h"
 %include "filter/hilbert_fc.h"
@@ -61,6 +65,8 @@ GR_SWIG_BLOCK_MAGIC2(filter, filter_delay_fc);
 GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fff);
 GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccf);
 GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccc);
+GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_scc);
+GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fsf);
 GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_ccc);
 GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_fff);
 GR_SWIG_BLOCK_MAGIC2(filter, hilbert_fc);
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..940aa5de7
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+  /* Dot product of short samples with complex taps; writes one value to *result. */
+  static const int N_UNROLL = 4;
+  /* Four independent partial sums, one per unrolled lane. */
+  lv_32fc_t acc0 = 0;
+  lv_32fc_t acc1 = 0;
+  lv_32fc_t acc2 = 0;
+  lv_32fc_t acc3 = 0;
+
+  unsigned i = 0;
+  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+  /* n is num_points rounded down to a multiple of N_UNROLL. */
+  for(i = 0; i < n; i += N_UNROLL) {
+    acc0 += taps[i + 0] * (float)input[i + 0];
+    acc1 += taps[i + 1] * (float)input[i + 1];
+    acc2 += taps[i + 2] * (float)input[i + 2];
+    acc3 += taps[i + 3] * (float)input[i + 3];
+  }
+  /* Leftover points (fewer than N_UNROLL) go into acc0. */
+  for(; i < num_points; i++) {
+    acc0 += taps[i] * (float)input[i];
+  }
+
+  *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
+  /* Dot product of short samples with complex taps, 8 samples per SSE pass. */
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64  m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+    /* Convert 8 shorts to floats; each group of four is converted twice. */
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+    /* Interleave the copies so every sample appears twice in a row. */
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+    /* 16 consecutive floats of taps, read with aligned loads. */
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+  _mm_empty(); /* __m64 ops above used MMX state; clear it before scalar float math */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Even lanes accumulate into the first output float, odd lanes into the second. */
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+  /* Scalar tail: each remaining sample multiplies two consecutive tap floats. */
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr)   * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+  /* Scalar reference: accumulate input[k]*taps[k] in a float, in index
+   * order, then store the sum converted to int16_t by a plain cast.
+   */
+  float acc = 0;
+  unsigned int k;
+
+  for(k = 0; k < num_points; k++) {
+    acc += input[k] * taps[k];
+  }
+
+  *result = (int16_t)acc;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
+  /* Float dot product of input and taps, 16 points per SSE pass; result cast to short. */
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+  /* Four independent 4-lane accumulators. */
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+    /* Aligned loads of 16 floats from each operand. */
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+  /* Fold the four accumulators into one vector. */
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+  /* Horizontal sum of the four lanes. */
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  /* Scalar tail for the points left after the last full group of 16. */
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (short)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
author	Tom Rondeau	2012-06-15 08:43:20 -0400
committer	Tom Rondeau	2012-06-15 08:43:20 -0400
commit	5585c71229cfa7886e0bd090828cd1f5104f6b27 (patch)
tree	c80bcc8821fb10a44c073ce1fa2a0f4816027bef
parent	a74286a2aa7fcddb52c165ba2c17cb2f55b5b592 (diff)
download	gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.tar.gz gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.tar.bz2 gnuradio-5585c71229cfa7886e0bd090828cd1f5104f6b27.zip