summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Rondeau2012-07-08 16:35:02 -0400
committerTom Rondeau2012-07-08 16:35:02 -0400
commit8a82bd5dc56a914dd65b88e63b056711fe28cec0 (patch)
treed5f52194502b45cf04019db6db5a00c71e77fe8f
parent3a142bebafdc018bccc80cf124a375b53db03581 (diff)
parent61dd97409abde2412595fa762c0c4161863604f4 (diff)
downloadgnuradio-8a82bd5dc56a914dd65b88e63b056711fe28cec0.tar.gz
gnuradio-8a82bd5dc56a914dd65b88e63b056711fe28cec0.tar.bz2
gnuradio-8a82bd5dc56a914dd65b88e63b056711fe28cec0.zip
Merge remote-tracking branch 'jblum/fix_volk_32f_x2_dot_prod_32f'
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h51
-rw-r--r--volk/lib/testqa.cc2
2 files changed, 25 insertions, 28 deletions
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index ab33a2587..b24e8b1f7 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
@@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index d1eb1cacb..507c78777 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -54,7 +54,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);