summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Rondeau2012-06-13 14:53:41 -0400
committerTom Rondeau2012-06-13 14:53:41 -0400
commit9e2e896e9d4dbc4627702cde82a48e9ee5136f26 (patch)
treefb62fd58ec90f34e080d1dc2f52b2200b57fa0ed
parent46bd64dc8231402535bad742a74250599e595148 (diff)
downloadgnuradio-9e2e896e9d4dbc4627702cde82a48e9ee5136f26.tar.gz
gnuradio-9e2e896e9d4dbc4627702cde82a48e9ee5136f26.tar.bz2
gnuradio-9e2e896e9d4dbc4627702cde82a48e9ee5136f26.zip
filter: process 4 vectors each time in volk dot_prod to speed up fir filters.
This makes the volk version of the SSE FIR filter the same speed as using the hand-crafted float_dotprod from before.
-rw-r--r--gr-filter/include/filter/fir_filter.h2
-rw-r--r--gr-filter/lib/fir_filter.cc38
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_a.h48
3 files changed, 72 insertions, 16 deletions
diff --git a/gr-filter/include/filter/fir_filter.h b/gr-filter/include/filter/fir_filter.h
index 1eb70f92a..e6194df7b 100644
--- a/gr-filter/include/filter/fir_filter.h
+++ b/gr-filter/include/filter/fir_filter.h
@@ -102,7 +102,7 @@ namespace gr {
gr_complex filter(const gr_complex input[]);
void filterN(gr_complex output[],
const gr_complex input[],
- unsigned long n)
+ unsigned long n);
void filterNdec(gr_complex output[],
const gr_complex input[],
unsigned long n,
diff --git a/gr-filter/lib/fir_filter.cc b/gr-filter/lib/fir_filter.cc
index 098dd8d36..c6e179246 100644
--- a/gr-filter/lib/fir_filter.cc
+++ b/gr-filter/lib/fir_filter.cc
@@ -24,6 +24,7 @@
#include <fft/fft.h>
#include <volk/volk.h>
#include <cstdio>
+#include <float_dotprod_x86.h>
namespace gr {
namespace filter {
@@ -101,12 +102,16 @@ namespace gr {
{
return d_ntaps;
}
-
+
+ /*
float
fir_filter_fff::filter(const float input[])
{
volk_32f_x2_dot_prod_32f_a(d_output, input,
- d_aligned_taps[d_offset], d_ntaps+3);
+ d_aligned_taps[d_offset],
+ (d_ntaps + d_offset - 1) / 4 + 1);
+ //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+ // (d_ntaps + d_offset - 1) / 4 + 1);
return *d_output;
}
@@ -126,7 +131,34 @@ namespace gr {
j += (d_offset == 0 ? 4 : 0);
}
}
+ */
+
+ float
+ fir_filter_fff::filter(const float input[])
+ {
+ //unsigned long ar = ((unsigned long) input);
+ //int off = (ar - (ar & ~15))/4;
+
+ const float *ar = (float *)((unsigned long) input & ~15);
+ unsigned al = input - ar;
+
+ volk_32f_x2_dot_prod_32f_a(d_output, ar,
+ d_aligned_taps[al],
+ (d_ntaps + al - 1) / 4 + 1);
+ //*d_output = float_dotprod_sse(input, d_aligned_taps[d_offset],
+ // (d_ntaps + d_offset - 1) / 4 + 1);
+ return *d_output;
+ }
+ void
+ fir_filter_fff::filterN(float output[],
+ const float input[],
+ unsigned long n)
+ {
+ for(unsigned long i = 0; i < n; i++) {
+ output[i] = filter(&input[i]);
+ }
+ }
void
fir_filter_fff::filterNdec(float output[],
@@ -136,7 +168,7 @@ namespace gr {
{
unsigned long j = 0;
for(unsigned long i = 0; i < n; i++) {
- filterN(&output[i], &input[j], 1);
+ output[i] = filter(&input[j]);
j += decimate;
}
}
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 448b2fdc0..8753ff615 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -37,26 +37,47 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- __m128 dotProdVal = _mm_setzero_ps();
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 4*4;
+ bPtr += 4*4;
}
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
@@ -66,6 +87,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
number = quarterPoints * 4;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;