summaryrefslogtreecommitdiff
path: root/volk/include
diff options
context:
space:
mode:
authorJohnathan Corgan2012-07-19 05:48:12 -0700
committerJohnathan Corgan2012-07-19 05:48:12 -0700
commit2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb (patch)
treed283e1b00374758e9d1015128f2255a33bf42b09 /volk/include
parent4e06f35f611aff2e1d4983327da54cf63e5b9ada (diff)
parent7c8347ca47b51ddaef03ab1804a3d37716870643 (diff)
downloadgnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.gz
gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.tar.bz2
gnuradio-2cb5f71bb54e6558d9435e4316eb1f0046bc8cfb.zip
Merge branch 'master' into wip/gr-blocks-master
Diffstat (limited to 'volk/include')
-rw-r--r--volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h4
-rw-r--r--volk/include/volk/volk_16u_byteswap_u.h63
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h51
3 files changed, 89 insertions, 29 deletions
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
index 940aa5de7..1f6554af8 100644
--- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -37,7 +37,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, cons
#endif /*LV_HAVE_GENERIC*/
-#ifdef LV_HAVE_SSE
+#if LV_HAVE_SSE && LV_HAVE_MMX
static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
@@ -116,7 +116,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const
*result = *(lv_32fc_t*)(&res[0]);
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
new file mode 100644
index 000000000..8ef627a62
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_u.h
@@ -0,0 +1,63 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps (in-place) an unaligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load the 16t values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps (in-place) an unaligned vector of int16_t's.
+ \param intsToSwap The vector of data to byte swap
+ \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index ab33a2587..b24e8b1f7 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
@@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);