diff options
-rw-r--r-- | gr-filter/include/filter/CMakeLists.txt | 2 | ||||
-rw-r--r-- | gr-filter/include/filter/fir_filter.h | 62 | ||||
-rw-r--r-- | gr-filter/lib/CMakeLists.txt | 2 | ||||
-rw-r--r-- | gr-filter/lib/fir_filter.cc | 232 | ||||
-rwxr-xr-x | gr-filter/python/qa_fir_filter.py | 95 | ||||
-rw-r--r-- | gr-filter/swig/filter_swig.i | 6 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 122 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_16i_a.h | 98 |
8 files changed, 617 insertions, 2 deletions
diff --git a/gr-filter/include/filter/CMakeLists.txt b/gr-filter/include/filter/CMakeLists.txt index 5b209873c..2620d3f54 100644 --- a/gr-filter/include/filter/CMakeLists.txt +++ b/gr-filter/include/filter/CMakeLists.txt @@ -64,7 +64,7 @@ endmacro(expand_h) ######################################################################## # Invoke macro to generate various sources ####################################################################### -expand_h(fir_filter_XXX fff ccf ccc) +expand_h(fir_filter_XXX fff ccf ccc scc fsf) add_custom_target(filter_generated_includes DEPENDS ${generated_includes} diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h index 8bfaa4f50..1fb3afb4d 100644 --- a/gr-filter/include/filter/fir_filter.h +++ b/gr-filter/include/filter/fir_filter.h @@ -122,6 +122,68 @@ namespace gr { int d_naligned; }; + /**************************************************************/ + + class FILTER_API fir_filter_scc + { + public: + fir_filter_scc(int decimation, + const std::vector<gr_complex> &taps); + ~fir_filter_scc(); + + void set_taps(const std::vector<gr_complex> &taps); + std::vector<gr_complex> taps() const; + unsigned int ntaps() const; + + gr_complex filter(const short input[]); + void filterN(gr_complex output[], + const short input[], + unsigned long n); + void filterNdec(gr_complex output[], + const short input[], + unsigned long n, + unsigned int decimate); + + private: + unsigned int d_ntaps; + gr_complex *d_taps; + gr_complex **d_aligned_taps; + gr_complex *d_output; + int d_align; + int d_naligned; + }; + + /**************************************************************/ + + class FILTER_API fir_filter_fsf + { + public: + fir_filter_fsf(int decimation, + const std::vector<float> &taps); + ~fir_filter_fsf(); + + void set_taps(const std::vector<float> &taps); + std::vector<float> taps() const; + unsigned int ntaps() const; + + short filter(const float input[]); + void filterN(short output[], + const float input[], + unsigned long n); + void filterNdec(short output[], + const float input[], + unsigned long n, + unsigned int decimate); + + private: + unsigned int d_ntaps; + float *d_taps; + float **d_aligned_taps; + short *d_output; + int d_align; + int d_naligned; + }; + } /* namespace kernel */ } /* namespace filter */ } /* namespace gr */ diff --git a/gr-filter/lib/CMakeLists.txt b/gr-filter/lib/CMakeLists.txt index b51a23bab..f5dbd1bb3 100644 --- a/gr-filter/lib/CMakeLists.txt +++ b/gr-filter/lib/CMakeLists.txt @@ -80,7 +80,7 @@ endmacro(expand_cc) ######################################################################## # Invoke macro to generate various sources ######################################################################## -expand_cc(fir_filter_XXX_impl fff ccf ccc) +expand_cc(fir_filter_XXX_impl fff ccf ccc scc fsf) ######################################################################## diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc index 18568da9d..be8017400 100644 --- a/gr-filter/lib/fir_filter.cc +++ b/gr-filter/lib/fir_filter.cc @@ -349,6 +349,238 @@ namespace gr { } } + /**************************************************************/ + + fir_filter_scc::fir_filter_scc(int decimation, + const std::vector<gr_complex> &taps) + { + d_align = volk_get_alignment(); + d_naligned = d_align / sizeof(short); + + d_taps = NULL; + set_taps(taps); + + // Make sure the output sample is always aligned, too. + d_output = fft::malloc_complex(1); + } + + fir_filter_scc::~fir_filter_scc() + { + // Free taps + if(d_taps != NULL) { + fft::free(d_taps); + d_taps = NULL; + } + + // Free all aligned taps + for(int i = 0; i < d_naligned; i++) { + fft::free(d_aligned_taps[i]); + } + fft::free(d_aligned_taps); + + // Free output sample + fft::free(d_output); + } + + void + fir_filter_scc::set_taps(const std::vector<gr_complex> &taps) + { + // Free the taps if already allocated + if(d_taps != NULL) { + fft::free(d_taps); + d_taps = NULL; + + for(int i = 0; i < d_naligned; i++) { + fft::free(d_aligned_taps[i]); + } + fft::free(d_aligned_taps); + } + + d_ntaps = (int)taps.size(); + d_taps = fft::malloc_complex(d_ntaps); + for(unsigned int i = 0; i < d_ntaps; i++) { + d_taps[d_ntaps-i-1] = taps[i]; + } + + // Make a set of taps at all possible arch alignments + d_aligned_taps = (gr_complex**)malloc(d_naligned*sizeof(gr_complex**)); + for(int i = 0; i < d_naligned; i++) { + d_aligned_taps[i] = fft::malloc_complex(d_ntaps+d_naligned-1); + memset(d_aligned_taps[i], 0, sizeof(gr_complex)*(d_ntaps+d_naligned-1)); + memcpy(&d_aligned_taps[i][i], d_taps, sizeof(gr_complex)*(d_ntaps)); + } + + } + + std::vector<gr_complex> + fir_filter_scc::taps() const + { + std::vector<gr_complex> t; + for(unsigned int i = 0; i < d_ntaps; i++) + t.push_back(d_taps[d_ntaps-i-1]); + return t; + } + + unsigned int + fir_filter_scc::ntaps() const + { + return d_ntaps; + } + + gr_complex + fir_filter_scc::filter(const short input[]) + { + const short *ar = (short *)((unsigned long) input & ~(d_align-1)); + unsigned al = input - ar; + + volk_16i_32fc_dot_prod_32fc_a(d_output, ar, + d_aligned_taps[al], + (d_ntaps+al)); + + return *d_output; + } + + void + fir_filter_scc::filterN(gr_complex output[], + const short input[], + unsigned long n) + { + for(unsigned long i = 0; i < n; i++) + output[i] = filter(&input[i]); + } + + + void + fir_filter_scc::filterNdec(gr_complex output[], + const short input[], + unsigned long n, + unsigned int decimate) + { + unsigned long j = 0; + for(unsigned long i = 0; i < n; i++){ + output[i] = filter(&input[j]); + j += decimate; + } + } + + /**************************************************************/ + + fir_filter_fsf::fir_filter_fsf(int decimation, + const std::vector<float> &taps) + { + d_align = volk_get_alignment(); + d_naligned = d_align / sizeof(float); + + d_taps = NULL; + set_taps(taps); + + // Make sure the output sample is always aligned, too. + d_output = (short*)fft::malloc_float(1); + } + + fir_filter_fsf::~fir_filter_fsf() + { + // Free taps + if(d_taps != NULL) { + fft::free(d_taps); + d_taps = NULL; + } + + // Free all aligned taps + for(int i = 0; i < d_naligned; i++) { + fft::free(d_aligned_taps[i]); + } + fft::free(d_aligned_taps); + + // Free output sample + fft::free(d_output); + } + + void + fir_filter_fsf::set_taps(const std::vector<float> &taps) + { + // Free the taps if already allocated + if(d_taps != NULL) { + fft::free(d_taps); + d_taps = NULL; + + for(int i = 0; i < d_naligned; i++) { + fft::free(d_aligned_taps[i]); + } + fft::free(d_aligned_taps); + } + + d_ntaps = (int)taps.size(); + d_taps = fft::malloc_float(d_ntaps); + for(unsigned int i = 0; i < d_ntaps; i++) { + d_taps[d_ntaps-i-1] = taps[i]; + } + + // Make a set of taps at all possible arch alignments + d_aligned_taps = (float**)malloc(d_naligned*sizeof(float**)); + for(int i = 0; i < d_naligned; i++) { + d_aligned_taps[i] = fft::malloc_float(d_ntaps+d_naligned-1); + memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+d_naligned-1)); + memcpy(&d_aligned_taps[i][i], d_taps, sizeof(float)*(d_ntaps)); + } + } + + std::vector<float> + fir_filter_fsf::taps() const + { + std::vector<float> t; + for(unsigned int i = 0; i < d_ntaps; i++) + t.push_back(d_taps[d_ntaps-i-1]); + return t; + } + + unsigned int + fir_filter_fsf::ntaps() const + { + return d_ntaps; + } + + short + fir_filter_fsf::filter(const float input[]) + { + const float *ar = (float *)((unsigned long) input & ~(d_align-1)); + unsigned al = input - ar; + + volk_32f_x2_dot_prod_16i_a(d_output, ar, + d_aligned_taps[al], + (d_ntaps+al)); + + //float out = 0; + //for(unsigned int i = 0; i < d_ntaps; i++) { + // out += d_taps[i] * input[i]; + //} + //*d_output = (short)out; + + return *d_output; + } + + void + fir_filter_fsf::filterN(short output[], + const float input[], + unsigned long n) + { + for(unsigned long i = 0; i < n; i++) + output[i] = filter(&input[i]); + } + + void + fir_filter_fsf::filterNdec(short output[], + const float input[], + unsigned long n, + unsigned int decimate) + { + unsigned long j = 0; + for(unsigned long i = 0; i < n; i++){ + output[i] = filter(&input[j]); + j += decimate; + } + } + } /* namespace kernel */ } /* namespace filter */ } /* namespace gr */ diff --git a/gr-filter/python/qa_fir_filter.py b/gr-filter/python/qa_fir_filter.py index ac20286cc..2a61498a2 100755 --- a/gr-filter/python/qa_fir_filter.py +++ b/gr-filter/python/qa_fir_filter.py @@ -218,6 +218,101 @@ class test_filter(gr_unittest.TestCase): self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5) + def test_fir_filter_scc_001(self): + src_data = 40*[1, 2, 3, 4] + expected_data = ((0.5+1j), (1.5+3j), (3+6j), (5+10j), (5.5+11j), + (6.5+13j), (8+16j), (10+20j), (10.5+21j), (11.5+23j), + (13+26j), (15+30j), (15.5+31j), (16.5+33j), (18+36j), + (20+40j), (20.5+41j), (21.5+43j), (23+46j), (25+50j), + (25.5+51j), (26.5+53j), (28+56j), (30+60j), (30.5+61j), + (31.5+63j), (33+66j), (35+70j), (35.5+71j), (36.5+73j), + (38+76j), (40+80j), (40.5+81j), (41.5+83j), (43+86j), + (45+90j), (45.5+91j), (46.5+93j), (48+96j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j)) + src = gr.vector_source_s(src_data) + op = filter.fir_filter_scc(1, 20*[0.5+1j, 0.5+1j]) + dst = gr.vector_sink_c() + self.tb.connect(src, op, dst) + self.tb.run() + result_data = dst.data() + self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5) + + + def test_fir_filter_scc_002(self): + src_data = 40*[1, 2, 3, 4] + expected_data = ((0.5+1j), (5.5+11j), (10.5+21j), (15.5+31j), (20.5+41j), + (25.5+51j), (30.5+61j), (35.5+71j), (40.5+81j), (45.5+91j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j), + (50+100j), (50+100j), (50+100j), (50+100j), (50+100j)) + src = gr.vector_source_s(src_data) + op = filter.fir_filter_scc(4, 20*[0.5+1j, 0.5+1j]) + dst = gr.vector_sink_c() + self.tb.connect(src, op, dst) + self.tb.run() + result_data = dst.data() + self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5) + + def test_fir_filter_fsf_001(self): + src_data = 40*[1, 2, 3, 4] + expected_data =(0, 1, 3, 5, 5, 6, 8, 10, 10, 11, 13, 15, 15, 16, 18, 20, 20, + 21, 23, 25, 25, 26, 28, 30, 30, 31, 33, 35, 35, 36, 38, 40, 40, + 41, 43, 45, 45, 46, 48, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50) + src = gr.vector_source_f(src_data) + op = filter.fir_filter_fsf(1, 20*[0.5, 0.5]) + dst = gr.vector_sink_s() + self.tb.connect(src, op, dst) + self.tb.run() + result_data = dst.data() + self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5) + + + def test_fir_filter_fsf_002(self): + src_data = 40*[1, 2, 3, 4] + expected_data = (0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50) + src = gr.vector_source_f(src_data) + op = filter.fir_filter_fsf(4, 20*[0.5, 0.5]) + dst = gr.vector_sink_s() + self.tb.connect(src, op, dst) + self.tb.run() + result_data = dst.data() + self.assertComplexTuplesAlmostEqual(expected_data, result_data, 5) + if __name__ == '__main__': gr_unittest.run(test_filter, "test_filter.xml") diff --git a/gr-filter/swig/filter_swig.i b/gr-filter/swig/filter_swig.i index cc15b5722..c9de3fb9a 100644 --- a/gr-filter/swig/filter_swig.i +++ b/gr-filter/swig/filter_swig.i @@ -36,6 +36,8 @@ #include "filter/fir_filter_fff.h" #include "filter/fir_filter_ccf.h" #include "filter/fir_filter_ccc.h" +#include "filter/fir_filter_scc.h" +#include "filter/fir_filter_fsf.h" #include "filter/fft_filter_ccc.h" #include "filter/fft_filter_fff.h" #include "filter/hilbert_fc.h" @@ -50,6 +52,8 @@ %include "filter/fir_filter_fff.h" %include "filter/fir_filter_ccf.h" %include "filter/fir_filter_ccc.h" +%include "filter/fir_filter_scc.h" +%include "filter/fir_filter_fsf.h" %include "filter/fft_filter_ccc.h" %include "filter/fft_filter_fff.h" %include "filter/hilbert_fc.h" @@ -61,6 +65,8 @@ GR_SWIG_BLOCK_MAGIC2(filter, filter_delay_fc); GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fff); GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccf); GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_ccc); +GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_scc); +GR_SWIG_BLOCK_MAGIC2(filter, fir_filter_fsf); GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_ccc); GR_SWIG_BLOCK_MAGIC2(filter, fft_filter_fff); GR_SWIG_BLOCK_MAGIC2(filter, hilbert_fc); diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h new file mode 100644 index 000000000..940aa5de7 --- /dev/null +++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H +#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { + + static const int N_UNROLL = 4; + + lv_32fc_t acc0 = 0; + lv_32fc_t acc1 = 0; + lv_32fc_t acc2 = 0; + lv_32fc_t acc3 = 0; + + unsigned i = 0; + unsigned n = (num_points / N_UNROLL) * N_UNROLL; + + for(i = 0; i < n; i += N_UNROLL) { + acc0 += taps[i + 0] * (float)input[i + 0]; + acc1 += taps[i + 1] * (float)input[i + 1]; + acc2 += taps[i + 2] * (float)input[i + 2]; + acc3 += taps[i + 3] * (float)input[i + 3]; + } + + for(; i < num_points; i++) { + acc0 += taps[i] * (float)input[i]; + } + + *result = acc0 + acc1 + acc2 + acc3; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m64 m0, m1; + __m128 f0, f1, f2, f3; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); + m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); + f0 = _mm_cvtpi16_ps(m0); + f1 = _mm_cvtpi16_ps(m0); + f2 = _mm_cvtpi16_ps(m1); + f3 = _mm_cvtpi16_ps(m1); + + a0Val = _mm_unpacklo_ps(f0, f1); + a1Val = _mm_unpackhi_ps(f0, f1); + a2Val = _mm_unpacklo_ps(f2, f3); + a3Val = _mm_unpackhi_ps(f2, f3); + + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 8; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE*/ + + +#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h new file mode 100644 index 000000000..961c2418c --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h @@ -0,0 +1,98 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H +#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (int16_t)dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/ |