diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/include/volk/volk_16u_byteswap_u.h | 76 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 3 |
3 files changed, 79 insertions, 1 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 6244abb35..648f4d878 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -33,6 +33,7 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h new file mode 100644 index 000000000..37bed7fce --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -0,0 +1,76 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_u_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_u_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 507c78777..2e41c25da 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); @@ -36,7 +37,7 @@ VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); -VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); |