diff options
-rw-r--r-- | volk/apps/volk_profile.cc | 2 | ||||
-rw-r--r-- | volk/include/volk/Makefile.am | 2 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 82 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 81 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 2 |
5 files changed, 169 insertions, 0 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 712c32bce..6ba7f17bb 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -56,6 +56,8 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results);
     VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results);
     VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results);
+    VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results);
     VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results);
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 20864efbe..d071f18f2 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -59,6 +59,8 @@ volkinclude_HEADERS = \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_u.h \
+	volk_32fc_x2_multiply_conjugate_32fc_a.h \
+	volk_32fc_x2_multiply_conjugate_32fc_u.h \
 	volk_32fc_s32f_power_32fc_a.h \
 	volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \
 	volk_32fc_s32f_atan2_32f_a.h \
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
new file mode 100644
index 000000000..70476a8c7
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies each element of aVector by the complex conjugate of the matching element of bVector and stores the products in cVector (SSE3 path; uses _mm_load_ps/_mm_store_ps, which require 16-byte-aligned addresses)
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_points / 2; // each loop iteration consumes two complex values (four floats)
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2;
+  lv_32fc_t* c = cVector;
+  const lv_32fc_t* a = aVector;
+  const lv_32fc_t* b = bVector;
+
+  __m128 conjugator = _mm_setr_ps(1, -1, 1, -1); // lanes 1 and 3 (the imaginary parts) get multiplied by -1
+
+  for(;number < halfPoints; number++){
+
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+    // FIXME: replace with xor for a faster implementation
+    y = _mm_mul_ps(y, conjugator); // conjugate y: cr,-ci,dr,-di
+
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with -ci,-ci,-di,-di
+
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = -ai*ci,-ar*ci,-bi*di,-br*di
+
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+
+    _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+    a += 2;
+    b += 2;
+    c += 2;
+  }
+
+  if((num_points % 2) != 0) { // odd count: one complex value is left over, handle it without SSE
+    *c = (*a) * lv_conj(*b);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Element-wise product of aVector with the complex conjugate of bVector, written to cVector (plain C path, no SIMD intrinsics)
+    \param cVector Output vector that receives the num_points products
+    \param aVector Left-hand operand, used as is
+    \param bVector Right-hand operand, conjugated with lv_conj before the multiplication
+    \param num_points Number of complex values read from aVector and bVector and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Walk the three arrays by index: one complex product per element,
+    // with the bVector operand conjugated first. Element idx of the output
+    // depends only on element idx of each input.
+    for(idx = 0; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
new file mode 100644
index 000000000..fbaa29c17
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies each element of aVector by the complex conjugate of the matching element of bVector and stores the products in cVector (SSE3 path; uses the unaligned _mm_loadu_ps/_mm_storeu_ps accesses)
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_points / 2; // each loop iteration consumes two complex values (four floats)
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2;
+  lv_32fc_t* c = cVector;
+  const lv_32fc_t* a = aVector;
+  const lv_32fc_t* b = bVector;
+  // Sign-bit mask for the imaginary lanes (1 and 3): -0.f has only bit 31 set, so XOR with it flips the sign and nothing else
+  const __m128 conjugator = _mm_setr_ps(0.f, -0.f, 0.f, -0.f);
+
+  for(;number < halfPoints; number++){
+
+    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+    y = _mm_xor_ps(y, conjugator); // conjugate y: cr,-ci,dr,-di
+
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with -ci,-ci,-di,-di
+
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = -ai*ci,-ar*ci,-bi*di,-br*di
+
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr+ai*ci, ai*cr-ar*ci, br*dr+bi*di, bi*dr-br*di
+
+    _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+    a += 2;
+    b += 2;
+    c += 2;
+  }
+
+  if((num_points % 2) != 0) { // odd count: one complex value is left over, handle it without SSE
+    *c = (*a) * lv_conj(*b);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Element-wise product of aVector with the complex conjugate of bVector, written to cVector (plain C path, no SIMD intrinsics)
+    \param cVector Output vector that receives the num_points products
+    \param aVector Left-hand operand, used as is
+    \param bVector Right-hand operand, conjugated with lv_conj before the multiplication
+    \param num_points Number of complex values read from aVector and bVector and written to cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Walk the three arrays by index: one complex product per element,
+    // with the bVector operand conjugated first. Element idx of the output
+    // depends only on element idx of each input.
+    for(idx = 0; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * lv_conj(bVector[idx]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index b00ea0b64..fdd3d4853 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -91,6 +91,8 @@ VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1);
 VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1);