From 9e2e896e9d4dbc4627702cde82a48e9ee5136f26 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 13 Jun 2012 14:53:41 -0400 Subject: filter: process 4 vectors each time in volk dot_prod to speed up fir filters. This makes the volk version of the SSE FIR filter the same speed as using the hand-crafted float_dotprod from before. --- gr-filter/include/filter/fir_filter.h | 2 +- gr-filter/lib/fir_filter.cc | 38 ++++++++++++++++++-- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 48 +++++++++++++++++++------- 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h index 1eb70f92a..e6194df7b 100644 --- a/gr-filter/include/filter/fir_filter.h +++ b/gr-filter/include/filter/fir_filter.h @@ -102,7 +102,7 @@ namespace gr { gr_complex filter(const gr_complex input[]); void filterN(gr_complex output[], const gr_complex input[], - unsigned long n) + unsigned long n); void filterNdec(gr_complex output[], const gr_complex input[], unsigned long n, diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc index 098dd8d36..c6e179246 100644 --- a/gr-filter/lib/fir_filter.cc +++ b/gr-filter/lib/fir_filter.cc @@ -24,6 +24,7 @@ #include #include #include +#include namespace gr { namespace filter { @@ -101,12 +102,16 @@ namespace gr { { return d_ntaps; } - + + /* float fir_filter_fff::filter(const float input[]) { volk_32f_x2_dot_prod_32f_a(d_output, input, - d_aligned_taps[d_offset], d_ntaps+3); + d_aligned_taps[d_offset], + (d_ntaps + d_offset - 1) / 4 + 1); + //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset], + // (d_ntaps + d_offset - 1) / 4 + 1); return *d_output; } @@ -126,7 +131,34 @@ namespace gr { j += (d_offset == 0 ? 4 : 0); } } + */ + + float + fir_filter_fff::filter(const float input[]) + { + //unsigned long ar = ((unsigned long) input); + //int off = (ar - (ar & ~15))/4; + + const float *ar = (float *)((unsigned long) input & ~15); + unsigned al = input - ar; + + volk_32f_x2_dot_prod_32f_a(d_output, ar, + d_aligned_taps[al], + (d_ntaps + al - 1) / 4 + 1); + //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset], + // (d_ntaps + d_offset - 1) / 4 + 1); + return *d_output; + } + void + fir_filter_fff::filterN(float output[], + const float input[], + unsigned long n) + { + for(unsigned long i = 0; i < n; i++) { + output[i] = filter(&input[i]); + } + } void fir_filter_fff::filterNdec(float output[], @@ -136,7 +168,7 @@ namespace gr { { unsigned long j = 0; for(unsigned long i = 0; i < n; i++) { - filterN(&output[i], &input[j], 1); + output[i] = filter(&input[j]); j += decimate; } } diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 448b2fdc0..8753ff615 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -37,26 +37,47 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 4*4; + bPtr += 4*4; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; @@ -66,6 +87,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* number = quarterPoints * 4; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; -- cgit