diff options
-rw-r--r-- | gr-filter/include/filter/fir_filter.h | 2 | ||||
-rw-r--r-- | gr-filter/lib/fir_filter.cc | 30 | ||||
-rw-r--r-- | gr-filter/lib/fir_filter_XXX_impl.cc.t | 2 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h | 111 |
4 files changed, 137 insertions, 8 deletions
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h index 3ce21d304..8bfaa4f50 100644 --- a/gr-filter/include/filter/fir_filter.h +++ b/gr-filter/include/filter/fir_filter.h @@ -84,7 +84,7 @@ namespace gr { private: unsigned int d_ntaps; - gr_complex *d_taps; + float *d_taps; float **d_aligned_taps; gr_complex *d_output; int d_align; diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc index 973be45d4..18568da9d 100644 --- a/gr-filter/lib/fir_filter.cc +++ b/gr-filter/lib/fir_filter.cc @@ -145,8 +145,14 @@ namespace gr { fir_filter_ccf::fir_filter_ccf(int decimation, const std::vector<float> &taps) { + d_align = volk_get_alignment(); + d_naligned = d_align / sizeof(gr_complex); + d_taps = NULL; set_taps(taps); + + // Make sure the output sample is always aligned, too. + d_output = fft::malloc_complex(1); } fir_filter_ccf::~fir_filter_ccf() @@ -167,9 +173,17 @@ namespace gr { } d_ntaps = (int)taps.size(); - d_taps = fft::malloc_complex(d_ntaps); + d_taps = fft::malloc_float(d_ntaps); for(unsigned int i = 0; i < d_ntaps; i++) { - d_taps[d_ntaps-i-1] = gr_complex(taps[i],0); + d_taps[d_ntaps-i-1] = taps[i]; + } + + // Make a set of taps at all possible arch alignments + d_aligned_taps = (float**)malloc(d_naligned*sizeof(float**)); + for(int i = 0; i < d_naligned; i++) { + d_aligned_taps[i] = fft::malloc_float(d_ntaps+d_naligned-1); + memset(d_aligned_taps[i], 0, sizeof(float)*(d_ntaps+d_naligned-1)); + memcpy(&d_aligned_taps[i][i], d_taps, sizeof(float)*(d_ntaps)); } } @@ -178,7 +192,7 @@ namespace gr { { std::vector<float> t; for(unsigned int i = 0; i < d_ntaps; i++) - t.push_back(d_taps[d_ntaps-i-1].real()); + t.push_back(d_taps[d_ntaps-i-1]); return t; } @@ -191,9 +205,13 @@ namespace gr { gr_complex fir_filter_ccf::filter(const gr_complex input[]) { - gr_complex output; - volk_32fc_x2_dot_prod_32fc_u(&output, input, d_taps, d_ntaps); - return output; + const gr_complex *ar = (gr_complex *)((unsigned long) input & ~(d_align-1)); + unsigned al = input - ar; + + volk_32fc_32f_dot_prod_32fc_a(d_output, ar, + d_aligned_taps[al], + (d_ntaps+al)); + return *d_output; } void diff --git a/gr-filter/lib/fir_filter_XXX_impl.cc.t b/gr-filter/lib/fir_filter_XXX_impl.cc.t index 18bec38be..8ea8151f8 100644 --- a/gr-filter/lib/fir_filter_XXX_impl.cc.t +++ b/gr-filter/lib/fir_filter_XXX_impl.cc.t @@ -40,7 +40,7 @@ namespace gr { @IMPL_NAME@::@IMPL_NAME@(int decimation, const std::vector<@TAP_TYPE@> &taps) - : gr_sync_decimator("fir_filter_fff", + : gr_sync_decimator("@BASE_NAME@", gr_make_io_signature(1, 1, sizeof(@I_TYPE@)), gr_make_io_signature(1, 1, sizeof(@O_TYPE@)), decimation) diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h new file mode 100644 index 000000000..109b787e8 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr= taps; + unsigned int number = 0; + + *realpt = 0; + *imagpt = 0; + + for(number = 0; number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 x0Val, x1Val, x2Val, x3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + + x0Val = _mm_load_ps(bPtr); + x1Val = _mm_load_ps(bPtr); + x2Val = _mm_load_ps(bPtr+4); + x3Val = _mm_load_ps(bPtr+4); + b0Val = _mm_unpacklo_ps(x0Val, x1Val); + b1Val = _mm_unpackhi_ps(x0Val, x1Val); + b2Val = _mm_unpacklo_ps(x2Val, x3Val); + b3Val = _mm_unpackhi_ps(x2Val, x3Val); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 8; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE*/ + + +#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/ |