diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 4 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 33 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 4 |
3 files changed, 6 insertions, 35 deletions
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h index 80c55e75f..05732b1ea 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -174,12 +174,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); - zeros = _mm256_set1_ps(0.0); - negated = _mm256_set1_ps(-1.0); const unsigned int fourthPoints = num_points / 4; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index cb2ac4c67..166a883a7 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - in += 4; tp += 4; - } - res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - } - } #endif /*LV_HAVE_GENERIC*/ @@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const ); - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - + if(((num_bytes >> 3) & 1)) { *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - } return; @@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - if((num_bytes >> 2) != 0) { + if(((num_bytes >> 3) & 1) != 0) { dotProduct += (*a) * (*b); } @@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv #include <smmintrin.h> static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; __m64 *p_result; @@ -442,12 +420,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const } - - - real1 = _mm_xor_ps(real1, (__m128)neg); - im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); @@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const *result += input[i] * taps[i]; } - */ } #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 813e62217..d1eb1cacb 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -35,8 +35,8 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1); -VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); |