summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h 105
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h 102
2 files changed, 154 insertions, 53 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 0543227b0..c26fd5e7c 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -8,17 +8,14 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr= taps;
unsigned int number = 0;
- for(number = 0; number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ for(number = 0; number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -31,10 +28,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -49,7 +46,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
__m128 dotProdVal2 = _mm_setzero_ps();
__m128 dotProdVal3 = _mm_setzero_ps();
- for(;number < quarterPoints; number++){
+ for(;number < sixteenthPoints; number++){
a0Val = _mm_load_ps(aPtr);
a1Val = _mm_load_ps(aPtr+4);
@@ -87,11 +84,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints*4;
- for(;number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -105,9 +99,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -122,7 +116,7 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
__m128 dotProdVal2 = _mm_setzero_ps();
__m128 dotProdVal3 = _mm_setzero_ps();
- for(;number < quarterPoints; number++){
+ for(;number < sixteenthPoints; number++){
a0Val = _mm_load_ps(aPtr);
a1Val = _mm_load_ps(aPtr+4);
@@ -159,11 +153,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints*4;
- for(;number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -176,9 +167,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
#include <smmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int sixteenthPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -223,11 +214,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = sixteenthPoints * 4;
- for(;number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -236,4 +224,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_AVX
+ // AVX kernel: writes sum(input[i] * taps[i]) for i in [0, num_points) to *result, reading both arrays with aligned 256-bit loads.
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // result: receives the scalar sum. input, taps: the two float arrays (loaded with _mm256_load_ps, which requires 32-byte-aligned addresses). num_points: number of element pairs.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Each vector iteration below consumes 16 floats from each array, hence the division by 16.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ // aNVal / bNVal hold the loaded input / taps lanes; cNVal hold their element-wise products.
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent accumulators, each carrying eight running partial sums.
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 consecutive floats from each array as two 8-float vectors.
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr+8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr+8);
+ // Multiply lane by lane.
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ // Add the products to the running partial sums.
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ // Step both pointers past the 16 floats just consumed.
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the second accumulator into the first, leaving eight partial sums.
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ // Scratch buffer for the horizontal sum; __VOLK_ATTR_ALIGNED(32) is defined elsewhere and presumably requests the 32-byte alignment the aligned store needs.
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Reduce the eight lanes to a single scalar.
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Scalar tail: process the num_points % 16 elements the vector loop left over; aPtr/bPtr already point at them.
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index dfafe2239..f9ae15094 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -8,17 +8,14 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr= taps;
unsigned int number = 0;
- for(number = 0; number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ for(number = 0; number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -31,10 +28,10 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -49,7 +46,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
__m128 dotProdVal2 = _mm_setzero_ps();
__m128 dotProdVal3 = _mm_setzero_ps();
- for(;number < quarterPoints; number++){
+ for(;number < sixteenthPoints; number++){
a0Val = _mm_load_ps(aPtr);
a1Val = _mm_load_ps(aPtr+4);
@@ -87,8 +84,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints*4;
- for(;number < num_4x_points; number++){
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
@@ -105,9 +102,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -122,7 +119,7 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
__m128 dotProdVal2 = _mm_setzero_ps();
__m128 dotProdVal3 = _mm_setzero_ps();
- for(;number < quarterPoints; number++){
+ for(;number < sixteenthPoints; number++){
a0Val = _mm_load_ps(aPtr);
a1Val = _mm_load_ps(aPtr+4);
@@ -159,11 +156,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints*4;
- for(;number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -176,9 +170,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
#include <smmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
unsigned int number = 0;
- const unsigned int sixteenthPoints = num_4x_points / 4;
+ const unsigned int sixteenthPoints = num_points / 16;
float dotProduct = 0;
const float* aPtr = input;
@@ -223,11 +217,8 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = sixteenthPoints * 4;
- for(;number < num_4x_points; number++){
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
+ number = sixteenthPoints * 16;
+ for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -236,4 +227,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_AVX
+ // AVX kernel: writes sum(input[i] * taps[i]) for i in [0, num_points) to *result, reading both arrays with unaligned 256-bit loads.
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const float* input, const float* taps, unsigned int num_points) {
+ // result: receives the scalar sum. input, taps: the two float arrays (loaded with _mm256_loadu_ps, so no alignment is required of them). num_points: number of element pairs.
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ // Each vector iteration below consumes 16 floats from each array, hence the division by 16.
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ // aNVal / bNVal hold the loaded input / taps lanes; cNVal hold their element-wise products.
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+ // Two independent accumulators, each carrying eight running partial sums.
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for(;number < sixteenthPoints; number++){
+ // Load 16 consecutive floats from each array as two 8-float vectors.
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr+8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr+8);
+ // Multiply lane by lane.
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ // Add the products to the running partial sums.
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ // Step both pointers past the 16 floats just consumed.
+ aPtr += 16;
+ bPtr += 16;
+ }
+ // Fold the second accumulator into the first, leaving eight partial sums.
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ // Scratch buffer for the horizontal sum; it carries __VOLK_ATTR_ALIGNED(32) (macro defined elsewhere) even though the store below is the unaligned variant.
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+ // Reduce the eight lanes to a single scalar.
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ // Scalar tail: process the num_points % 16 elements the vector loop left over; aPtr/bPtr already point at them.
+ number = sixteenthPoints*16;
+ for(;number < num_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/