diff options
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 84 | ||||
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 151 |
2 files changed, 160 insertions, 75 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 8753ff615..0543227b0 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -8,14 +8,17 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_points; number++){ + for(number = 0; number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -28,10 +31,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -67,8 +70,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); - aPtr += 4*4; - bPtr += 4*4; + aPtr += 16; + bPtr += 16; } dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); @@ -84,8 +87,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); @@ -102,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* #include <pmmintrin.h> -static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); - cVal = _mm_mul_ps(aVal, bVal); + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); - aPtr += 4; - bPtr += 4; + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -149,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * #include <smmintrin.h> -static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + const unsigned int sixteenthPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -196,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ + number = sixteenthPoints * 4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h index 3b7284b57..dfafe2239 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -1,20 +1,24 @@ #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#include <volk/volk_common.h> #include<stdio.h> #ifdef LV_HAVE_GENERIC -static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) { float dotProduct = 0; const float* aPtr = input; const float* bPtr= taps; unsigned int number = 0; - for(number = 0; number < num_points; number++){ + for(number = 0; number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -27,43 +31,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa #ifdef LV_HAVE_SSE -static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 16; + bPtr += 16; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -77,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* #include <pmmintrin.h> -static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; + const unsigned int quarterPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val); + dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val); + dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val); + dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val); + + aPtr += 16; + bPtr += 16; } - __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; - number = quarterPoints * 4; - for(;number < num_points; number++){ + number = quarterPoints*4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } @@ -124,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * #include <smmintrin.h> -static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) { unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; + const unsigned int sixteenthPoints = num_4x_points / 4; float dotProduct = 0; const float* aPtr = input; @@ -141,15 +193,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float for(;number < sixteenthPoints; number++){ - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); @@ -171,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float dotProduct += dotProductVector[2]; dotProduct += dotProductVector[3]; - number = sixteenthPoints * 16; - for(;number < num_points; number++){ + number = sixteenthPoints * 4; + for(;number < num_4x_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); dotProduct += ((*aPtr++) * (*bPtr++)); } |