diff options
Diffstat (limited to 'volk')
-rw-r--r-- | volk/include/volk/volk_32f_x2_dot_prod_32f_a.h | 48 |
1 files changed, 36 insertions, 12 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h index 448b2fdc0..8753ff615 100644 --- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h @@ -37,26 +37,47 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* const float* aPtr = input; const float* bPtr = taps; - __m128 aVal, bVal, cVal; + __m128 a0Val, a1Val, a2Val, a3Val; + __m128 b0Val, b1Val, b2Val, b3Val; + __m128 c0Val, c1Val, c2Val, c3Val; - __m128 dotProdVal = _mm_setzero_ps(); + __m128 dotProdVal0 = _mm_setzero_ps(); + __m128 dotProdVal1 = _mm_setzero_ps(); + __m128 dotProdVal2 = _mm_setzero_ps(); + __m128 dotProdVal3 = _mm_setzero_ps(); for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; + a0Val = _mm_load_ps(aPtr); + a1Val = _mm_load_ps(aPtr+4); + a2Val = _mm_load_ps(aPtr+8); + a3Val = _mm_load_ps(aPtr+12); + b0Val = _mm_load_ps(bPtr); + b1Val = _mm_load_ps(bPtr+4); + b2Val = _mm_load_ps(bPtr+8); + b3Val = _mm_load_ps(bPtr+12); + + c0Val = _mm_mul_ps(a0Val, b0Val); + c1Val = _mm_mul_ps(a1Val, b1Val); + c2Val = _mm_mul_ps(a2Val, b2Val); + c3Val = _mm_mul_ps(a3Val, b3Val); + + dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0); + dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1); + dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2); + dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3); + + aPtr += 4*4; + bPtr += 4*4; } + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2); + dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3); + __VOLK_ATTR_ALIGNED(16) float dotProductVector[4]; - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector dotProduct = dotProductVector[0]; dotProduct += dotProductVector[1]; @@ -66,6 +87,9 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const float* number = quarterPoints * 4; for(;number < num_points; number++){ dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); + dotProduct += ((*aPtr++) * (*bPtr++)); } *result = dotProduct; |