summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gr-filter/include/filter/CMakeLists.txt2
-rw-r--r--gr-filter/include/filter/fir_filter.h62
-rw-r--r--gr-filter/lib/CMakeLists.txt2
-rw-r--r--gr-filter/lib/fir_filter.cc232
-rwxr-xr-xgr-filter/python/qa_fir_filter.py95
-rw-r--r--gr-filter/swig/filter_swig.i6
-rw-r--r--volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h122
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_16i_a.h98
8 files changed, 617 insertions, 2 deletions
diff --git a/gr-filter/include/filter/CMakeLists.txt b/gr-filter/include/filter/CMakeLists.txt
index 5b209873c..2620d3f54 100644
--- a/gr-filter/include/filter/CMakeLists.txt
+++ b/gr-filter/include/filter/CMakeLists.txt
@@ -64,7 +64,7 @@ endmacro(expand_h)
########################################################################
# Invoke macro to generate various sources
#######################################################################
-expand_h(fir_filter_XXX fff ccf ccc)
+expand_h(fir_filter_XXX fff ccf ccc scc fsf)
add_custom_target(filter_generated_includes DEPENDS
${generated_includes}
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 8bfaa4f50..1fb3afb4d 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -122,6 +122,68 @@ namespace gr {
int d_naligned;
};
+ /**************************************************************/
+
+ // FIR filter kernel: short input samples, gr_complex taps, gr_complex output.
+ //
+ // NOTE(review): the class holds raw buffer pointers that its destructor
+ // frees, and it declares no copy constructor or assignment operator, so a
+ // copied instance would free the same buffers again -- confirm instances
+ // are never copied.
+ class FILTER_API fir_filter_scc
+ {
+ public:
+ // Stores a copy of taps; the decimation argument is not referenced by the
+ // constructor body (decimation is supplied per call to filterNdec()).
+ fir_filter_scc(int decimation,
+ const std::vector<gr_complex> &taps);
+ ~fir_filter_scc();
+
+ // Replaces the current tap set (stored internally in reversed order).
+ void set_taps(const std::vector<gr_complex> &taps);
+ // Returns the taps in the order they were passed to set_taps().
+ std::vector<gr_complex> taps() const;
+ // Number of taps currently loaded.
+ unsigned int ntaps() const;
+
+ // Produces one output sample from input[0] .. input[ntaps()-1].
+ gr_complex filter(const short input[]);
+ // output[i] = filter(&input[i]) for i in [0, n).
+ void filterN(gr_complex output[],
+ const short input[],
+ unsigned long n);
+ // output[i] = filter(&input[i*decimate]) for i in [0, n).
+ void filterNdec(gr_complex output[],
+ const short input[],
+ unsigned long n,
+ unsigned int decimate);
+
+ private:
+ unsigned int d_ntaps; // number of taps
+ gr_complex *d_taps; // taps in reversed order
+ gr_complex **d_aligned_taps; // one zero-padded copy of d_taps per sample offset
+ gr_complex *d_output; // single-sample buffer the dot product writes into
+ int d_align; // value returned by volk_get_alignment()
+ int d_naligned; // d_align / sizeof(short)
+ };
+
+ /**************************************************************/
+
+ // FIR filter kernel: float input samples, float taps, short output.
+ //
+ // NOTE(review): the class holds raw buffer pointers that its destructor
+ // frees, and it declares no copy constructor or assignment operator, so a
+ // copied instance would free the same buffers again -- confirm instances
+ // are never copied.
+ class FILTER_API fir_filter_fsf
+ {
+ public:
+ // Stores a copy of taps; the decimation argument is not referenced by the
+ // constructor body (decimation is supplied per call to filterNdec()).
+ fir_filter_fsf(int decimation,
+ const std::vector<float> &taps);
+ ~fir_filter_fsf();
+
+ // Replaces the current tap set (stored internally in reversed order).
+ void set_taps(const std::vector<float> &taps);
+ // Returns the taps in the order they were passed to set_taps().
+ std::vector<float> taps() const;
+ // Number of taps currently loaded.
+ unsigned int ntaps() const;
+
+ // Produces one output sample from input[0] .. input[ntaps()-1].
+ short filter(const float input[]);
+ // output[i] = filter(&input[i]) for i in [0, n).
+ void filterN(short output[],
+ const float input[],
+ unsigned long n);
+ // output[i] = filter(&input[i*decimate]) for i in [0, n).
+ void filterNdec(short output[],
+ const float input[],
+ unsigned long n,
+ unsigned int decimate);
+
+ private:
+ unsigned int d_ntaps; // number of taps
+ float *d_taps; // taps in reversed order
+ float **d_aligned_taps; // one zero-padded copy of d_taps per sample offset
+ short *d_output; // single-sample buffer the dot product writes into
+ int d_align; // value returned by volk_get_alignment()
+ int d_naligned; // d_align / sizeof(float)
+ };
+
} /* namespace kernel */
} /* namespace filter */
} /* namespace gr */
diff --git a/gr-filter/lib/CMakeLists.txt b/gr-filter/lib/CMakeLists.txt
index b51a23bab..f5dbd1bb3 100644
--- a/gr-filter/lib/CMakeLists.txt
+++ b/gr-filter/lib/CMakeLists.txt
@@ -80,7 +80,7 @@ endmacro(expand_cc)
########################################################################
# Invoke macro to generate various sources
########################################################################
-expand_cc(fir_filter_XXX_impl fff ccf ccc)
+expand_cc(fir_filter_XXX_impl fff ccf ccc scc fsf)
########################################################################
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 18568da9d..be8017400 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -349,6 +349,238 @@ namespace gr {
}
}
+ /**************************************************************/
+
+ // Builds the kernel: queries the VOLK alignment, loads the taps and
+ // allocates the one-sample output buffer.
+ //
+ // decimation is accepted for interface symmetry but is not used here;
+ // filterNdec() takes the decimation factor per call.
+ fir_filter_scc::fir_filter_scc(int decimation,
+                                const std::vector<gr_complex> &taps)
+ {
+   d_align = volk_get_alignment();
+
+   // One padded tap set is kept per possible input offset (in samples)
+   // from an aligned address. If the reported alignment is smaller than
+   // one sample the division yields 0 and filter() would index an empty
+   // table, so always keep at least one set.
+   d_naligned = d_align / sizeof(short);
+   if(d_naligned < 1) {
+     d_naligned = 1;
+   }
+
+   // set_taps() only frees existing buffers when d_taps is non-NULL.
+   d_taps = NULL;
+   d_aligned_taps = NULL;
+   set_taps(taps);
+
+   // Make sure the output sample is always aligned, too.
+   d_output = fft::malloc_complex(1);
+ }
+
+ // Releases the tap buffers, the table of aligned tap copies and the
+ // output sample buffer.
+ fir_filter_scc::~fir_filter_scc()
+ {
+   // Free taps
+   if(d_taps != NULL) {
+     fft::free(d_taps);
+     d_taps = NULL;
+   }
+
+   // Free all aligned taps (each copy came from fft::malloc_complex)
+   for(int i = 0; i < d_naligned; i++) {
+     fft::free(d_aligned_taps[i]);
+   }
+
+   // The pointer table itself is allocated with malloc() in set_taps(),
+   // so it has to go back through free(), not the fft allocator.
+   ::free(d_aligned_taps);
+   d_aligned_taps = NULL;
+
+   // Free output sample
+   fft::free(d_output);
+   d_output = NULL;
+ }
+
+ // Replaces the tap set.
+ //
+ // The taps are stored reversed in d_taps, and d_naligned zero-padded
+ // copies are built so that copy i has i leading zeros; filter() picks
+ // the copy matching the input's offset from an aligned address.
+ void
+ fir_filter_scc::set_taps(const std::vector<gr_complex> &taps)
+ {
+   // Free the taps if already allocated
+   if(d_taps != NULL) {
+     fft::free(d_taps);
+     d_taps = NULL;
+
+     for(int i = 0; i < d_naligned; i++) {
+       fft::free(d_aligned_taps[i]);
+     }
+     // The table was allocated with malloc() below; release it with free().
+     ::free(d_aligned_taps);
+     d_aligned_taps = NULL;
+   }
+
+   d_ntaps = (int)taps.size();
+   d_taps = fft::malloc_complex(d_ntaps);
+   for(unsigned int i = 0; i < d_ntaps; i++) {
+     d_taps[d_ntaps-i-1] = taps[i];
+   }
+
+   // Make a set of taps at all possible arch alignments.
+   // The table holds d_naligned pointers, hence sizeof(gr_complex*).
+   d_aligned_taps = (gr_complex**)malloc(d_naligned*sizeof(gr_complex*));
+   for(int i = 0; i < d_naligned; i++) {
+     d_aligned_taps[i] = fft::malloc_complex(d_ntaps+d_naligned-1);
+     memset(d_aligned_taps[i], 0, sizeof(gr_complex)*(d_ntaps+d_naligned-1));
+     memcpy(&d_aligned_taps[i][i], d_taps, sizeof(gr_complex)*(d_ntaps));
+   }
+ }
+
+ // Returns the taps in the order originally given to set_taps(); the
+ // internal buffer holds them reversed, so it is walked back to front.
+ std::vector<gr_complex>
+ fir_filter_scc::taps() const
+ {
+   std::vector<gr_complex> ordered;
+   for(unsigned int k = d_ntaps; k > 0; k--) {
+     ordered.push_back(d_taps[k-1]);
+   }
+   return ordered;
+ }
+
+ // Number of taps currently loaded by set_taps().
+ unsigned int
+ fir_filter_scc::ntaps() const
+ {
+   return d_ntaps;
+ }
+
+ // Computes one output sample from input[0] .. input[d_ntaps-1].
+ //
+ // The aligned VOLK kernel is called on the aligned address at or below
+ // input; the al samples in front of input are multiplied by the zero
+ // padding of the matching tap copy, so they do not affect the result.
+ gr_complex
+ fir_filter_scc::filter(const short input[])
+ {
+   // Round the address down to a d_align boundary (assumes d_align is a
+   // power of two). size_t is used for the integer round-trip because
+   // unsigned long is narrower than a pointer on LLP64 platforms.
+   const short *ar = (const short *)((size_t) input & ~((size_t)d_align - 1));
+   unsigned al = input - ar;
+
+   volk_16i_32fc_dot_prod_32fc_a(d_output, ar,
+                                 d_aligned_taps[al],
+                                 (d_ntaps+al));
+
+   return *d_output;
+ }
+
+ // Filters n consecutive positions: output[k] is the filter applied to
+ // the window starting at input[k].
+ void
+ fir_filter_scc::filterN(gr_complex output[],
+                         const short input[],
+                         unsigned long n)
+ {
+   unsigned long k = 0;
+   while(k < n) {
+     output[k] = filter(input + k);
+     k++;
+   }
+ }
+
+
+ // Decimating filter: output[k] is the filter applied to the window
+ // starting at input[k*decimate], for k in [0, n).
+ void
+ fir_filter_scc::filterNdec(gr_complex output[],
+                            const short input[],
+                            unsigned long n,
+                            unsigned int decimate)
+ {
+   for(unsigned long k = 0; k < n; k++) {
+     output[k] = filter(&input[k * decimate]);
+   }
+ }
+
+ /**************************************************************/
+
+ // Builds the kernel: queries the VOLK alignment, loads the taps and
+ // allocates the one-sample output buffer.
+ //
+ // decimation is accepted for interface symmetry but is not used here;
+ // filterNdec() takes the decimation factor per call.
+ fir_filter_fsf::fir_filter_fsf(int decimation,
+                                const std::vector<float> &taps)
+ {
+   d_align = volk_get_alignment();
+
+   // One padded tap set is kept per possible input offset (in samples)
+   // from an aligned address. If the reported alignment is smaller than
+   // one sample the division yields 0 and filter() would index an empty
+   // table, so always keep at least one set.
+   d_naligned = d_align / sizeof(float);
+   if(d_naligned < 1) {
+     d_naligned = 1;
+   }
+
+   // set_taps() only frees existing buffers when d_taps is non-NULL.
+   d_taps = NULL;
+   d_aligned_taps = NULL;
+   set_taps(taps);
+
+   // Make sure the output sample is always aligned, too. The buffer is
+   // obtained from the float allocator and used to hold a single short.
+   d_output = (short*)fft::malloc_float(1);
+ }
+
+ // Releases the tap buffers, the table of aligned tap copies and the
+ // output sample buffer.
+ fir_filter_fsf::~fir_filter_fsf()
+ {
+   // Free taps
+   if(d_taps != NULL) {
+     fft::free(d_taps);
+     d_taps = NULL;
+   }
+
+   // Free all aligned taps (each copy came from fft::malloc_float)
+   for(int i = 0; i < d_naligned; i++) {
+     fft::free(d_aligned_taps[i]);
+   }
+
+   // The pointer table itself is allocated with malloc() in set_taps(),
+   // so it has to go back through free(), not the fft allocator.
+   ::free(d_aligned_taps);
+   d_aligned_taps = NULL;
+
+   // Free output sample
+   fft::free(d_output);
+   d_output = NULL;
+ }
+
+ // Replaces the tap set.
+ //
+ // The taps are stored reversed in d_taps, and d_naligned zero-padded
+ // copies are built so that copy i has i leading zeros; filter() picks
+ // the copy matching the input's offset from an aligned address.
+ void
+ fir_filter_fsf::set_taps(const std::vector<float> &taps)
+ {
+   // Free the taps if already allocated
+   if(d_taps != NULL) {
+     fft::free(d_taps);
+     d_taps = NULL;
+
+     for(int i = 0; i < d_naligned; i++) {
+       fft::free(d_aligned_taps[i]);
+     }
+     // The table was allocated with malloc() below; release it with free().
+     ::free(d_aligned_taps);
+     d_aligned_taps = NULL;
+   }
+
+   d_ntaps = (int)taps.size();
+   d_taps = fft::malloc_float(d_ntaps);
+   for(unsigned int i = 0; i < d_ntaps; i++) {
+     d_taps[d_ntaps-i-1] = taps[i];
+   }
+
+   // Make a set of taps at all possible arch alignments.
+   // The table holds d_naligned pointers, hence sizeof(float*).
+   d_aligned_taps = (float**)malloc(d_naligned*sizeof(float*));
+   for(int i = 0; i < d_naligned; i++) {
+     d_aligned_taps[i] = fft::malloc_float(d_ntaps+d_naligned-1);
+     memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+d_naligned-1));
+     memcpy(&d_aligned_taps[i][i], d_taps, sizeof(float)*(d_ntaps));
+   }
+ }
+
+ // Returns the taps in the order originally given to set_taps(); the
+ // internal buffer holds them reversed, so it is walked back to front.
+ std::vector<float>
+ fir_filter_fsf::taps() const
+ {
+   std::vector<float> ordered;
+   for(unsigned int k = d_ntaps; k > 0; k--) {
+     ordered.push_back(d_taps[k-1]);
+   }
+   return ordered;
+ }
+
+ // Number of taps currently loaded by set_taps().
+ unsigned int
+ fir_filter_fsf::ntaps() const
+ {
+   return d_ntaps;
+ }
+
+ // Computes one output sample from input[0] .. input[d_ntaps-1].
+ //
+ // The aligned VOLK kernel is called on the aligned address at or below
+ // input; the al samples in front of input are multiplied by the zero
+ // padding of the matching tap copy, so they do not affect the result.
+ // Scalar equivalent: sum over i < d_ntaps of d_taps[i] * input[i],
+ // converted to short.
+ short
+ fir_filter_fsf::filter(const float input[])
+ {
+   // Round the address down to a d_align boundary (assumes d_align is a
+   // power of two). size_t is used for the integer round-trip because
+   // unsigned long is narrower than a pointer on LLP64 platforms.
+   const float *ar = (const float *)((size_t) input & ~((size_t)d_align - 1));
+   unsigned al = input - ar;
+
+   volk_32f_x2_dot_prod_16i_a(d_output, ar,
+                              d_aligned_taps[al],
+                              (d_ntaps+al));
+
+   return *d_output;
+ }
+
+ // Filters n consecutive positions: output[k] is the filter applied to
+ // the window starting at input[k].
+ void
+ fir_filter_fsf::filterN(short output[],
+                         const float input[],
+                         unsigned long n)
+ {
+   unsigned long k = 0;
+   while(k < n) {
+     output[k] = filter(input + k);
+     k++;
+   }
+ }
+
+ // Decimating filter: output[k] is the filter applied to the window
+ // starting at input[k*decimate], for k in [0, n).
+ void
+ fir_filter_fsf::filterNdec(short output[],
+                            const float input[],
+                            unsigned long n,
+                            unsigned int decimate)
+ {
+   for(unsigned long k = 0; k < n; k++) {
+     output[k] = filter(&input[k * decimate]);
+   }
+ }
+
} /* namespace kernel */
} /* namespace filter */
} /* namespace gr */
diff --git a/gr-filter/python/qa_fir_filter.py b/gr-filter/python/qa_fir_filter.py
index ac20286cc..2a61498a2 100755
--- a/gr-filter/python/qa_fir_filter.py
+++ b/gr-filter/python/qa_fir_filter.py
@@ -218,6 +218,101 @@ class test_filter(gr_unittest.TestCase):
self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
+ def test_fir_filter_scc_001(self):
+ # Runs 160 short samples (40 repetitions of 1, 2, 3, 4) through
+ # fir_filter_scc with decimation 1 and 40 identical taps of 0.5+1j,
+ # then compares the complex sink output against expected_data.
+ src_data = 40*[1, 2, 3, 4]
+ expected_data = ((0.5+1j), (1.5+3j), (3+6j), (5+10j), (5.5+11j),
+ (6.5+13j), (8+16j), (10+20j), (10.5+21j), (11.5+23j),
+ (13+26j), (15+30j), (15.5+31j), (16.5+33j), (18+36j),
+ (20+40j), (20.5+41j), (21.5+43j), (23+46j), (25+50j),
+ (25.5+51j), (26.5+53j), (28+56j), (30+60j), (30.5+61j),
+ (31.5+63j), (33+66j), (35+70j), (35.5+71j), (36.5+73j),
+ (38+76j), (40+80j), (40.5+81j), (41.5+83j), (43+86j),
+ (45+90j), (45.5+91j), (46.5+93j), (48+96j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j))
+ src = gr.vector_source_s(src_data)
+ op = filter.fir_filter_scc(1, 20*[0.5+1j, 0.5+1j])
+ dst = gr.vector_sink_c()
+ self.tb.connect(src, op, dst)
+ self.tb.run()
+ result_data = dst.data()
+ self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
+
+
+ def test_fir_filter_scc_002(self):
+ # Same input and taps as test_fir_filter_scc_001 but with decimation 4,
+ # so 40 complex outputs are expected for the 160 input samples.
+ src_data = 40*[1, 2, 3, 4]
+ expected_data = ((0.5+1j), (5.5+11j), (10.5+21j), (15.5+31j), (20.5+41j),
+ (25.5+51j), (30.5+61j), (35.5+71j), (40.5+81j), (45.5+91j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j),
+ (50+100j), (50+100j), (50+100j), (50+100j), (50+100j))
+ src = gr.vector_source_s(src_data)
+ op = filter.fir_filter_scc(4, 20*[0.5+1j, 0.5+1j])
+ dst = gr.vector_sink_c()
+ self.tb.connect(src, op, dst)
+ self.tb.run()
+ result_data = dst.data()
+ self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
+
+ def test_fir_filter_fsf_001(self):
+ # Runs 160 float samples (40 repetitions of 1, 2, 3, 4) through
+ # fir_filter_fsf with decimation 1 and 40 identical taps of 0.5, then
+ # compares the short sink output against expected_data. The expected
+ # values are integers (e.g. a running sum of 0.5 appears as 0).
+ # NOTE(review): the complex-tuple comparison helper is used on short
+ # data here -- confirm that is intended rather than an exact compare.
+ src_data = 40*[1, 2, 3, 4]
+ expected_data =(0, 1, 3, 5, 5, 6, 8, 10, 10, 11, 13, 15, 15, 16, 18, 20, 20,
+ 21, 23, 25, 25, 26, 28, 30, 30, 31, 33, 35, 35, 36, 38, 40, 40,
+ 41, 43, 45, 45, 46, 48, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50)
+ src = gr.vector_source_f(src_data)
+ op = filter.fir_filter_fsf(1, 20*[0.5, 0.5])
+ dst = gr.vector_sink_s()
+ self.tb.connect(src, op, dst)
+ self.tb.run()
+ result_data = dst.data()
+ self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
+
+
+ def test_fir_filter_fsf_002(self):
+ # Same input and taps as test_fir_filter_fsf_001 but with decimation 4,
+ # so 40 short outputs are expected for the 160 input samples.
+ src_data = 40*[1, 2, 3, 4]
+ expected_data = (0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
+ 50, 50, 50, 50, 50, 50, 50, 50, 50, 50)
+ src = gr.vector_source_f(src_data)
+ op = filter.fir_filter_fsf(4, 20*[0.5, 0.5])
+ dst = gr.vector_sink_s()
+ self.tb.connect(src, op, dst)
+ self.tb.run()
+ result_data = dst.data()
+ self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5)
+
if __name__ == '__main__':
gr_unittest.run(test_filter, "test_filter.xml")
diff --git a/gr-filter/swig/filter_swig.i b/gr-filter/swig/filter_swig.i
index cc15b5722..c9de3fb9a 100644
--- a/gr-filter/swig/filter_swig.i
+++ b/gr-filter/swig/filter_swig.i
@@ -36,6 +36,8 @@
#include "filter/fir_filter_fff.h"
#include "filter/fir_filter_ccf.h"
#include "filter/fir_filter_ccc.h"
+#include "filter/fir_filter_scc.h"
+#include "filter/fir_filter_fsf.h"
#include "filter/fft_filter_ccc.h"
#include "filter/fft_filter_fff.h"
#include "filter/hilbert_fc.h"
@@ -50,6 +52,8 @@
%include "filter/fir_filter_fff.h"
%include "filter/fir_filter_ccf.h"
%include "filter/fir_filter_ccc.h"
+%include "filter/fir_filter_scc.h"
+%include "filter/fir_filter_fsf.h"
%include "filter/fft_filter_ccc.h"
%include "filter/fft_filter_fff.h"
%include "filter/hilbert_fc.h"
@@ -61,6 +65,8 @@ GR_SWIG_BLOCK_MAGIC2(filter, filter_delay_fc);
GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fff);
GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccf);
GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccc);
+GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_scc);
+GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fsf);
GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_ccc);
GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_fff);
GR_SWIG_BLOCK_MAGIC2(filter, hilbert_fc);
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..940aa5de7
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/* Portable dot product of num_points short samples with complex taps.
+ * Four independent partial sums are kept over the unrolled part; leftover
+ * points are folded into the first partial sum. */
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+  static const int N_UNROLL = 4;
+
+  lv_32fc_t sum0 = 0;
+  lv_32fc_t sum1 = 0;
+  lv_32fc_t sum2 = 0;
+  lv_32fc_t sum3 = 0;
+
+  /* Largest multiple of N_UNROLL that does not exceed num_points. */
+  const unsigned unrolled_end = num_points - (num_points % N_UNROLL);
+  unsigned k = 0;
+
+  while(k < unrolled_end) {
+    sum0 += taps[k + 0] * (float)input[k + 0];
+    sum1 += taps[k + 1] * (float)input[k + 1];
+    sum2 += taps[k + 2] * (float)input[k + 2];
+    sum3 += taps[k + 3] * (float)input[k + 3];
+    k += N_UNROLL;
+  }
+
+  /* Remaining 0..3 points. */
+  while(k < num_points) {
+    sum0 += taps[k] * (float)input[k];
+    k++;
+  }
+
+  *result = sum0 + sum1 + sum2 + sum3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/* SSE dot product of num_points short samples with complex taps.
+ *
+ * taps is read with aligned loads (_mm_load_ps), so it is expected to be
+ * 16-byte aligned. Each loop iteration handles 8 samples: the shorts are
+ * converted to float, every sample is duplicated so it lines up with the
+ * (real, imag) pair of its tap, and the products are accumulated in four
+ * vector sums. Leftover points are handled by the scalar tail loop. */
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  /* Number of full 8-sample iterations. */
+  const unsigned int eighthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64 m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < eighthPoints; number++){
+
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+
+    /* Interleave each converted vector with itself: (s0,s0,s1,s1), ... */
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  /* The __m64 intrinsics above use the MMX registers, which alias the x87
+   * register stack; clear the MMX state before any scalar float code runs. */
+  _mm_empty();
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  /* Lanes 0 and 2 hold real parts, lanes 1 and 3 imaginary parts. */
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  /* Scalar tail for the points not covered by the vector loop. */
+  number = eighthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr) * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+/* Portable dot product of two float vectors; the float sum is converted
+ * to int16_t with a plain cast (no rounding or saturation applied here). */
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+  float acc = 0;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    acc += (input[idx] * taps[idx]);
+  }
+
+  *result = (int16_t)acc;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+/* SSE dot product of two float vectors, result converted to a 16-bit integer.
+ * Both vectors are read with _mm_load_ps (the aligned-load intrinsic), so
+ * input and taps are expected to be 16-byte aligned. */
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+ unsigned int number = 0;
+ /* Number of full iterations of 16 floats each. */
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ /* Four independent vector accumulators, combined after the loop. */
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+ /* Horizontal sum of the four lanes. */
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ /* Scalar tail for the points not covered by the vector loop. */
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ /* Plain cast to short: no rounding or saturation is applied here. */
+ *result = (short)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/