diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 1 | ||||
-rw-r--r-- | volk/gen/machines.xml | 6 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 4 | ||||
-rw-r--r-- | volk/include/volk/volk_16u_byteswap_u.h | 63 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 51 | ||||
-rw-r--r-- | volk/lib/CMakeLists.txt | 8 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 5 |
7 files changed, 103 insertions, 35 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 6244abb35..648f4d878 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -33,6 +33,7 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml index 8e3c9c8c2..d88a1a50c 100644 --- a/volk/gen/machines.xml +++ b/volk/gen/machines.xml @@ -10,7 +10,7 @@ </machine> <machine name="sse"> -<archs>generic 32|64| mmx sse orc|</archs> +<archs>generic 32|64| mmx| sse orc|</archs> </machine> --> @@ -20,7 +20,7 @@ <!-- trailing | bar means generate without either for MSVC --> <machine name="sse2"> -<archs>generic 32|64| mmx sse sse2 orc|</archs> +<archs>generic 32|64| mmx| sse sse2 orc|</archs> </machine> <machine name="sse3"> @@ -45,7 +45,7 @@ <!-- trailing | bar means generate without either for MSVC --> <machine name="avx"> -<archs>generic 32|64| mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs> +<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs> </machine> <machine name="altivec"> diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h index 940aa5de7..1f6554af8 100644 --- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h @@ -37,7 +37,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, cons #endif /*LV_HAVE_GENERIC*/ -#ifdef LV_HAVE_SSE +#if LV_HAVE_SSE && LV_HAVE_MMX static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { @@ -116,7 +116,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const *result = *(lv_32fc_t*)(&res[0]); } -#endif /*LV_HAVE_SSE*/ +#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h new file mode 100644 index 000000000..8ef627a62 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -0,0 +1,63 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index ab33a2587..b24e8b1f7 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); - dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; @@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * for(;number < sixteenthPoints; number++){ - a0Val = _mm_load_ps(aPtr); - a1Val = _mm_load_ps(aPtr+4); - a2Val = _mm_load_ps(aPtr+8); - a3Val = _mm_load_ps(aPtr+12); - b0Val = _mm_load_ps(bPtr); - b1Val = _mm_load_ps(bPtr+4); - b2Val = _mm_load_ps(bPtr+8); - b3Val = _mm_load_ps(bPtr+12); + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); c0Val = _mm_mul_ps(a0Val, b0Val); c1Val = _mm_mul_ps(a1Val, b1Val); @@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float for(;number < sixteenthPoints; number++){ - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt index 8288786c9..59d78b446 100644 --- a/volk/lib/CMakeLists.txt +++ b/volk/lib/CMakeLists.txt @@ -135,6 +135,12 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) if (${SIZEOF_CPU} EQUAL 32) OVERRULE_ARCH(64 "CPU width is 32 bits") endif() + + #MSVC 64 bit does not have MMX, overrule it + if (${SIZEOF_CPU} EQUAL 64 AND MSVC) + OVERRULE_ARCH(mmx "No MMX for Win64") + endif() + endif() ######################################################################## @@ -159,7 +165,7 @@ execute_process( # When this occurs, eliminate the redundant machines # to avoid unnecessary compilation of subset machines. ######################################################################## -foreach(arch orc 64 32) +foreach(arch mmx orc 64 32) foreach(machine_name ${available_machines}) string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name}) if (${machine_name} STREQUAL ${machine_name_no_arch}) diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index d1eb1cacb..2e41c25da 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); @@ -36,7 +37,7 @@ VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); -VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); @@ -54,7 +55,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); -//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1); //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); |