diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/apps/volk_profile.cc | 4 | ||||
-rw-r--r-- | volk/gen/machines.xml | 6 | ||||
-rw-r--r-- | volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h | 122 | ||||
-rw-r--r-- | volk/include/volk/volk_16u_byteswap_u.h | 63 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_16i_a.h | 98 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 169 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 170 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h | 111 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h | 4 | ||||
-rw-r--r-- | volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h | 35 | ||||
-rw-r--r-- | volk/include/volk/volk_32u_byteswap_u.h | 77 | ||||
-rw-r--r-- | volk/include/volk/volk_64u_byteswap_u.h | 88 | ||||
-rw-r--r-- | volk/lib/CMakeLists.txt | 8 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 6 |
14 files changed, 858 insertions, 103 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 0b4442108..648f4d878 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -33,6 +33,8 @@ int main(int argc, char *argv[]) { //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results); @@ -49,6 +51,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results); VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results); @@ -76,6 +79,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results); VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results); //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results); VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results); diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml index 8e3c9c8c2..d88a1a50c 100644 --- a/volk/gen/machines.xml +++ b/volk/gen/machines.xml @@ -10,7 +10,7 @@ </machine> <machine name="sse"> -<archs>generic 32|64| mmx sse orc|</archs> +<archs>generic 32|64| mmx| sse orc|</archs> </machine> --> @@ -20,7 +20,7 @@ <!-- trailing | bar means generate without either for MSVC --> <machine name="sse2"> -<archs>generic 32|64| mmx sse sse2 orc|</archs> +<archs>generic 32|64| mmx| sse sse2 orc|</archs> </machine> <machine name="sse3"> @@ -45,7 +45,7 @@ <!-- trailing | bar means generate without either for MSVC --> <machine name="avx"> -<archs>generic 32|64| mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs> +<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs> </machine> <machine name="altivec"> diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h new file mode 100644 index 000000000..1f6554af8 --- /dev/null +++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H +#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) { + + static const int N_UNROLL = 4; + + lv_32fc_t acc0 = 0; + lv_32fc_t acc1 = 0; + lv_32fc_t acc2 = 0; + lv_32fc_t acc3 = 0; + + unsigned i = 0; + unsigned n = (num_points / N_UNROLL) * N_UNROLL; + + for(i = 0; i < n; i += N_UNROLL) { + acc0 += taps[i + 0] * (float)input[i + 0]; + acc1 += taps[i + 1] * (float)input[i + 1]; + acc2 += taps[i + 2] * (float)input[i + 2]; + acc3 += taps[i + 3] * (float)input[i + 3]; + } + + for(; i < num_points; i++) { + acc0 += taps[i] * (float)input[i]; + } + + *result = acc0 + acc1 + acc2 + acc3; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_MMX + + +static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const short* aPtr = input; + const float* bPtr = (float*)taps; + + __m64 m0, m1; + __m128 f0, f1, f2, f3; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0)); + m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4)); + f0 = _mm_cvtpi16_ps(m0); + f1 = _mm_cvtpi16_ps(m0); + f2 = _mm_cvtpi16_ps(m1); + f3 = _mm_cvtpi16_ps(m1); + + a0Val = _mm_unpacklo_ps(f0, f1); + a1Val = _mm_unpackhi_ps(f0, f1); + a2Val = _mm_unpacklo_ps(f2, f3); + a3Val = _mm_unpackhi_ps(f2, f3); + + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 8; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr) * (*bPtr++)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/ + + +#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h new file mode 100644 index 000000000..8ef627a62 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_u.h @@ -0,0 +1,63 @@ +#ifndef INCLUDED_volk_16u_byteswap_u_H +#define INCLUDED_volk_16u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an unaligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_16u_byteswap_u_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h new file mode 100644 index 000000000..961c2418c --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h @@ -0,0 +1,98 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H +#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (int16_t)dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = (short)dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 448b2fdc0..067c33ad8 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -31,39 +31,60 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); + for(;number < sixteenthPoints; number++){ - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; + number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -80,38 +101,59 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + for(;number < sixteenthPoints; number++){ - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; + number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -182,4 +224,67 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr+8); + b0Val = _mm256_load_ps(bPtr); + b1Val = _mm256_load_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 3b7284b57..b24e8b1f7 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#include <volk/volk_common.h> #include<stdio.h> @@ -30,39 +31,60 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); + for(;number < sixteenthPoints; number++){ - aPtr += 4; - bPtr += 4; + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; + number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -79,38 +101,59 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int sixteenthPoints = num_points / 16; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + for(;number < sixteenthPoints; number++){ - aPtr += 4; - bPtr += 4; + a0Val = _mm_loadu_ps(aPtr); + a1Val = _mm_loadu_ps(aPtr+4); + a2Val = _mm_loadu_ps(aPtr+8); + a3Val = _mm_loadu_ps(aPtr+12); + b0Val = _mm_loadu_ps(bPtr); + b1Val = _mm_loadu_ps(bPtr+4); + b2Val = _mm_loadu_ps(bPtr+8); + b3Val = _mm_loadu_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; + number = sixteenthPoints*16; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -181,4 +224,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_AVX + +#include <immintrin.h> + +static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m256 a0Val, a1Val; + __m256 b0Val, b1Val; + __m256 c0Val, c1Val; + + __m256 dotProdVal0 = _mm256_setzero_ps(); + __m256 dotProdVal1 = _mm256_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + b0Val = _mm256_loadu_ps(bPtr); + b1Val = _mm256_loadu_ps(bPtr+8); + + c0Val = _mm256_mul_ps(a0Val, b0Val); + c1Val = _mm256_mul_ps(a1Val, b1Val); + + dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1); + + aPtr += 16; + bPtr += 16; + } + + dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1); + + __VOLK_ATTR_ALIGNED(32) float dotProductVector[8]; + + _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + dotProduct += dotProductVector[4]; + dotProduct += dotProductVector[5]; + dotProduct += dotProductVector[6]; + dotProduct += dotProductVector[7]; + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_AVX*/ + #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h new file mode 100644 index 000000000..109b787e8 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h @@ -0,0 +1,111 @@ +#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H +#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H + +#include <volk/volk_common.h> +#include<stdio.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) { + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr= taps; + unsigned int number = 0; + + *realpt = 0; + *imagpt = 0; + + for(number = 0; number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE + + +static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 8; + + float res[2]; + float *realpt = &res[0], *imagpt = &res[1]; + const float* aPtr = (float*)input; + const float* bPtr = taps; + + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 x0Val, x1Val, x2Val, x3Val; + __m128 c0Val, c1Val, c2Val, c3Val; + + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + + x0Val = _mm_load_ps(bPtr); + x1Val = _mm_load_ps(bPtr); + x2Val = _mm_load_ps(bPtr+4); + x3Val = _mm_load_ps(bPtr+4); + b0Val = _mm_unpacklo_ps(x0Val, x1Val); + b1Val = _mm_unpackhi_ps(x0Val, x1Val); + b2Val = _mm_unpacklo_ps(x2Val, x3Val); + b3Val = _mm_unpackhi_ps(x2Val, x3Val); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 8; + } + + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector + + *realpt = dotProductVector[0]; + *imagpt = dotProductVector[1]; + *realpt += dotProductVector[2]; + *imagpt += dotProductVector[3]; + + number = sixteenthPoints*8; + for(;number < num_points; number++){ + *realpt += ((*aPtr++) * (*bPtr)); + *imagpt += ((*aPtr++) * (*bPtr++)); + } + + *result = *(lv_32fc_t*)(&res[0]); +} + +#endif /*LV_HAVE_SSE*/ + + +#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/ diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h index 08db99373..51b6041ec 100644 --- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h @@ -171,12 +171,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2])); printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3])); printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros; + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; phase_Val = _mm256_loadu_ps((float*)phase_Ptr); inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr)); - zeros = _mm256_set1_ps(0.0); - negated = _mm256_set1_ps(-1.0); const unsigned int fourthPoints = num_points / 4; diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h index cb2ac4c67..caef3e6f0 100644 --- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h @@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const unsigned int n_2_ccomplex_blocks = num_bytes >> 4; unsigned int isodd = (num_bytes >> 3) &1; - - float sum0[2] = {0,0}; float sum1[2] = {0,0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - in += 4; tp += 4; - } - res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - } - } #endif /*LV_HAVE_GENERIC*/ @@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const ); - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - + if(((num_bytes >> 3) & 1)) { *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - } return; @@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - if((num_bytes >> 2) != 0) { + if(((num_bytes >> 3) & 1) != 0) { dotProduct += (*a) * (*b); } @@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv #include <smmintrin.h> static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; float *p_input, *p_taps; __m64 *p_result; @@ -442,11 +420,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const } - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); im0 = _mm_add_ps(im0, im1); real0 = _mm_add_ps(real0, real1); @@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const *result += input[i] * taps[i]; } - */ } #endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h new file mode 100644 index 000000000..e27d1f03d --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_u_H +#define INCLUDED_volk_32u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_u_H */ diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h new file mode 100644 index 000000000..41a4a3130 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_u.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_u_H +#define INCLUDED_volk_64u_byteswap_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE2 +#include <emmintrin.h> + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_loadu_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_storeu_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_u_H */ diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt index 328a4e087..d9aeb797c 100644 --- a/volk/lib/CMakeLists.txt +++ b/volk/lib/CMakeLists.txt @@ -150,6 +150,12 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86) if (${SIZEOF_CPU} EQUAL 32) OVERRULE_ARCH(64 "CPU width is 32 bits") endif() + + #MSVC 64 bit does not have MMX, overrule it + if (${SIZEOF_CPU} EQUAL 64 AND MSVC) + OVERRULE_ARCH(mmx "No MMX for Win64") + endif() + endif() ######################################################################## @@ -174,7 +180,7 @@ execute_process( # When this occurs, eliminate the redundant machines # to avoid unnecessary compilation of subset machines. ######################################################################## -foreach(arch orc 64 32) +foreach(arch mmx orc 64 32) foreach(machine_name ${available_machines}) string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name}) if (${machine_name} STREQUAL ${machine_name_no_arch}) diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index aac676729..2e41c25da 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -20,6 +20,8 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); @@ -34,7 +36,8 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1); +VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); @@ -53,6 +56,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1); //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1); |