summary | refs | log | tree | commit | diff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r-- volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 48
1 files changed, 36 insertions, 12 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 448b2fdc0..8753ff615 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -37,26 +37,47 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
const float* aPtr = input;
const float* bPtr = taps;
- __m128 aVal, bVal, cVal;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
- __m128 dotProdVal = _mm_setzero_ps();
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- bVal = _mm_load_ps(bPtr);
-
- cVal = _mm_mul_ps(aVal, bVal);
-
- dotProdVal = _mm_add_ps(cVal, dotProdVal);
-
- aPtr += 4;
- bPtr += 4;
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr+4);
+ a2Val = _mm_load_ps(aPtr+8);
+ a3Val = _mm_load_ps(aPtr+12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr+4);
+ b2Val = _mm_load_ps(bPtr+8);
+ b3Val = _mm_load_ps(bPtr+12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 4*4;
+ bPtr += 4*4;
}
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
__VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
- _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+ _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
dotProduct = dotProductVector[0];
dotProduct += dotProductVector[1];
@@ -66,6 +87,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float*
number = quarterPoints * 4;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;