summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h 84
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_u.h 151
2 files changed, 160 insertions, 75 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 8753ff615..0543227b0 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -8,14 +8,17 @@
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr= taps;
unsigned int number = 0;
- for(number = 0; number < num_points; number++){
+ for(number = 0; number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -28,10 +31,10 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
@@ -67,8 +70,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
- aPtr += 4*4;
- bPtr += 4*4;
+ aPtr += 16;
+ bPtr += 16;
}
dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
@@ -84,8 +87,8 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
- for(;number < num_points; number++){
+ number = quarterPoints*4;
+ for(;number < num_4x_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
@@ -102,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- __m128 dotProdVal = _mm_setzero_ps();
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
- cVal = _mm_mul_ps(aVal, bVal);
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
- aPtr += 4;
- bPtr += 4;
+ aPtr += 16;
+ bPtr += 16;
}
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
- for(;number < num_points; number++){
+ number = quarterPoints*4;
+ for(;number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -149,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float *
#include <smmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ const unsigned int sixteenthPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
@@ -196,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
+ number = sixteenthPoints * 4;
+ for(;number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 3b7284b57..dfafe2239 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -1,20 +1,24 @@
#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#include <volk/volk_common.h>
#include<stdio.h>
#ifdef LV_HAVE_GENERIC
-static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr= taps;
unsigned int number = 0;
- for(number = 0; number < num_points; number++){
+ for(number = 0; number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -27,43 +31,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
#ifdef LV_HAVE_SSE
-static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- __m128 dotProdVal = _mm_setzero_ps();
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_loadu_ps(aPtr); // _u kernel: keep the unaligned loads this loop used before unrolling
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr); // _mm_load_ps would require bPtr to be 16-byte aligned
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
}
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
- for(;number < num_points; number++){
+ number = quarterPoints*4;
+ for(;number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -77,41 +105,65 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
#include <pmmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
+ const unsigned int quarterPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- __m128 dotProdVal = _mm_setzero_ps();
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- bVal = _mm_loadu_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_loadu_ps(aPtr); // _u kernel: keep the unaligned loads this loop used before unrolling
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr); // _mm_load_ps would require bPtr to be 16-byte aligned
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
}
- __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
- number = quarterPoints * 4;
- for(;number < num_points; number++){
+ number = quarterPoints*4;
+ for(;number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}
@@ -124,9 +176,9 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
#include <smmintrin.h>
-static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_4x_points) {
unsigned int number = 0;
- const unsigned int sixteenthPoints = num_points / 16;
+ const unsigned int sixteenthPoints = num_4x_points / 4;
float dotProduct = 0;
const float* aPtr = input;
@@ -141,15 +193,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
- aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; // _u kernel: keep the unaligned loads used before this change
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
- bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
- bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; // _mm_load_ps would require bPtr to be 16-byte aligned
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
@@ -171,8 +223,11 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
dotProduct += dotProductVector[2];
dotProduct += dotProductVector[3];
- number = sixteenthPoints * 16;
- for(;number < num_points; number++){
+ number = sixteenthPoints * 4;
+ for(;number < num_4x_points; number++){
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
dotProduct += ((*aPtr++) * (*bPtr++));
}