summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- gr-filter/lib/qa_fir_filter_with_buffer.cc 8
-rw-r--r-- volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h 4
-rw-r--r-- volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h 33
-rw-r--r-- volk/lib/testqa.cc 4
4 files changed, 9 insertions, 40 deletions
diff --git a/gr-filter/lib/qa_fir_filter_with_buffer.cc b/gr-filter/lib/qa_fir_filter_with_buffer.cc
index f8af9ea0b..eaa0359d0 100644
--- a/gr-filter/lib/qa_fir_filter_with_buffer.cc
+++ b/gr-filter/lib/qa_fir_filter_with_buffer.cc
@@ -39,8 +39,6 @@ namespace gr {
#define MAX_DATA (16383)
#define ERR_DELTA (1e-5)
-#define NELEM(x) (sizeof(x) / sizeof(x[0]))
-
static float
uniform()
{
@@ -58,8 +56,8 @@ namespace gr {
random_complex(gr_complex *buf, unsigned n)
{
for(unsigned i = 0; i < n; i++) {
- float re = rint(uniform () * MAX_DATA);
- float im = rint(uniform () * MAX_DATA);
+ float re = rint(uniform() * MAX_DATA);
+ float im = rint(uniform() * MAX_DATA);
buf[i] = gr_complex(re, im);
}
}
@@ -263,7 +261,7 @@ namespace gr {
new kernel::fir_filter_with_buffer_ccc(f1_taps);
// zero the output, then do the filtering
- memset(actual_output, 0, sizeof(actual_output));
+ memset(actual_output, 0, OUTPUT_LEN*sizeof(o_type));
f1->filterNdec(actual_output, input, ol/decimate, decimate);
// check results
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
index 80c55e75f..05732b1ea 100644
--- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
@@ -174,12 +174,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
- __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
- zeros = _mm256_set1_ps(0.0);
- negated = _mm256_set1_ps(-1.0);
const unsigned int fourthPoints = num_points / 4;
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
index cb2ac4c67..166a883a7 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
@@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
unsigned int isodd = (num_bytes >> 3) &1;
-
-
float sum0[2] = {0,0};
float sum1[2] = {0,0};
unsigned int i = 0;
-
for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
sum1[0] += in[2] * tp[2] - in[3] * tp[3];
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-
in += 4;
tp += 4;
-
}
-
res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1];
-
-
for(i = 0; i < isodd; ++i) {
-
-
*result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
}
-
}
#endif /*LV_HAVE_GENERIC*/
@@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const
);
- int getem = num_bytes % 16;
-
-
- for(; getem > 0; getem -= 8) {
-
-
+ if(((num_bytes >> 3) & 1)) {
*result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-
}
return;
@@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
dotProduct += ( dotProductVector[0] + dotProductVector[1] );
- if((num_bytes >> 2) != 0) {
+ if(((num_bytes >> 3) & 1) != 0) {
dotProduct += (*a) * (*b);
}
@@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
#include <smmintrin.h>
static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
- volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes);
- // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
- /*
+
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
float *p_input, *p_taps;
__m64 *p_result;
@@ -442,12 +420,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
}
-
-
-
real1 = _mm_xor_ps(real1, (__m128)neg);
-
im0 = _mm_add_ps(im0, im1);
real0 = _mm_add_ps(real0, real1);
@@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
*result += input[i] * taps[i];
}
- */
}
#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 813e62217..d1eb1cacb 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -35,8 +35,8 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);