Merge branch 'master' into wip/gr-blocks-master

author: Johnathan Corgan 2012-07-19 05:48:12 -0700
committer: Johnathan Corgan 2012-07-19 05:48:12 -0700
commit: 2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb (patch)
tree: d283e1b00374758e9d1015128f2255a33bf42b09 /volk/include
parent: 4e06f35f611aff2e1d4983327da54cf63e5b9ada (diff)
parent: 7c8347ca47b51ddaef03ab1804a3d37716870643 (diff)
download: gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.gz
gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.bz2
gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.zip
3 files changed, 89 insertions, 29 deletions
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
index 940aa5de7..1f6554af8 100644
--- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -37,7 +37,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, cons
 #endif /*LV_HAVE_GENERIC*/
 
 
-#ifdef LV_HAVE_SSE
+#if LV_HAVE_SSE && LV_HAVE_MMX
 
 
 static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
@@ -116,7 +116,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const
   *result = *(lv_32fc_t*)(&res[0]);
 }
 
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
 
 
 #endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
new file mode 100644
index 000000000..8ef627a62
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_u.h
@@ -0,0 +1,63 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of uint16_t's, eight values per SSE2 iteration.
+  \param intsToSwap The vector of data to byte swap; overwritten with the swapped values
+  \param num_points The number of uint16_t data points in intsToSwap
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+  uint16_t* inputPtr = intsToSwap;
+  __m128i input, left, right, output;
+
+  const unsigned int eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load eight 16-bit values (unaligned load); inputPtr is advanced only after the store since the swap is in-place.
+    input = _mm_loadu_si128((__m128i*)inputPtr);
+    // Shift every 16-bit lane by 8 bits in both directions: low byte moves up, high byte moves down
+    left = _mm_slli_epi16(input, 8);
+    right = _mm_srli_epi16(input, 8);
+    // OR the two shifted copies together, leaving each lane with its bytes exchanged
+    output = _mm_or_si128(left, right);
+    // Store the results back over the values just read (unaligned store)
+    _mm_storeu_si128((__m128i*)inputPtr, output);
+    inputPtr += 8;
+  }
+
+  // Byteswap the remaining (num_points % 8) points one at a time:
+  number = eighthPoints*8;
+  for(; number < num_points; number++){
+    uint16_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of uint16_t's, one value at a time.
+  \param intsToSwap The vector of data to byte swap; overwritten with the swapped values
+  \param num_points The number of uint16_t data points in intsToSwap
+*/
+static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int point;
+  uint16_t* inputPtr = intsToSwap;
+  for(point = 0; point < num_points; point++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); // exchange the high and low bytes
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index ab33a2587..b24e8b1f7 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
 
   for(;number < sixteenthPoints; number++){
 
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
 
     c0Val = _mm_mul_ps(a0Val, b0Val);
     c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
   number = sixteenthPoints*16;
   for(;number < num_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
-    dotProduct += ((*aPtr++) * (*bPtr++));
-    dotProduct += ((*aPtr++) * (*bPtr++));
-    dotProduct += ((*aPtr++) * (*bPtr++));
   }
 
   *result = dotProduct;
@@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
 
   for(;number < sixteenthPoints; number++){
 
-    a0Val = _mm_load_ps(aPtr);
-    a1Val = _mm_load_ps(aPtr+4);
-    a2Val = _mm_load_ps(aPtr+8);
-    a3Val = _mm_load_ps(aPtr+12);
-    b0Val = _mm_load_ps(bPtr);
-    b1Val = _mm_load_ps(bPtr+4);
-    b2Val = _mm_load_ps(bPtr+8);
-    b3Val = _mm_load_ps(bPtr+12);
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
 
     c0Val = _mm_mul_ps(a0Val, b0Val);
     c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
 
   for(;number < sixteenthPoints; number++){
 
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
 
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
 
     cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
     cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
author	Johnathan Corgan	2012-07-19 05:48:12 -0700
committer	Johnathan Corgan	2012-07-19 05:48:12 -0700
commit	2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb (patch)
tree	d283e1b00374758e9d1015128f2255a33bf42b09 /volk/include
parent	4e06f35f611aff2e1d4983327da54cf63e5b9ada (diff)
parent	7c8347ca47b51ddaef03ab1804a3d37716870643 (diff)
download	gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.gz gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.bz2 gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.zip