14 files changed, 858 insertions, 103 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 0b4442108..648f4d878 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
     //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
     //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
     VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results);
@@ -49,6 +51,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results);
     VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results);
     VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
+    VOLK_PROFILE(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results);
     VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results);
     VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results);
@@ -76,6 +79,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000, &results);
     VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results);
     VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results);
+    VOLK_PROFILE(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 5000, &results);
     //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results);
     VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results);
     VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results);
diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml
index 8e3c9c8c2..d88a1a50c 100644
--- a/volk/gen/machines.xml
+++ b/volk/gen/machines.xml
@@ -10,7 +10,7 @@
 </machine>
 
 <machine name="sse">
-<archs>generic 32|64| mmx sse orc|</archs>
+<archs>generic 32|64| mmx| sse orc|</archs>
 </machine>
 -->
 
@@ -20,7 +20,7 @@
 
 <!-- trailing | bar means generate without either for MSVC -->
 <machine name="sse2">
-<archs>generic 32|64| mmx sse sse2 orc|</archs>
+<archs>generic 32|64| mmx| sse sse2 orc|</archs>
 </machine>
 
 <machine name="sse3">
@@ -45,7 +45,7 @@
 
 <!-- trailing | bar means generate without either for MSVC -->
 <machine name="avx">
-<archs>generic 32|64| mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
 </machine>
 
 <machine name="altivec">
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
new file mode 100644
index 000000000..1f6554af8
--- /dev/null
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, const short* input, const lv_32fc_t * taps, unsigned int num_points) {
+
+  static const int N_UNROLL = 4;
+
+  lv_32fc_t acc0 = 0;
+  lv_32fc_t acc1 = 0;
+  lv_32fc_t acc2 = 0;
+  lv_32fc_t acc3 = 0;
+
+  unsigned i = 0;
+  unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+  for(i = 0; i < n; i += N_UNROLL) {
+    acc0 += taps[i + 0] * (float)input[i + 0];
+    acc1 += taps[i + 1] * (float)input[i + 1];
+    acc2 += taps[i + 2] * (float)input[i + 2];
+    acc3 += taps[i + 3] * (float)input[i + 3];
+  }
+
+  for(; i < num_points; i++) {
+    acc0 += taps[i] * (float)input[i];
+  }
+
+  *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const  short* input, const  lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const short* aPtr = input;
+  const float* bPtr = (float*)taps;
+
+  __m64  m0, m1;
+  __m128 f0, f1, f2, f3;
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    m0 = _mm_set_pi16(*(aPtr+3), *(aPtr+2), *(aPtr+1), *(aPtr+0));
+    m1 = _mm_set_pi16(*(aPtr+7), *(aPtr+6), *(aPtr+5), *(aPtr+4));
+    f0 = _mm_cvtpi16_ps(m0);
+    f1 = _mm_cvtpi16_ps(m0);
+    f2 = _mm_cvtpi16_ps(m1);
+    f3 = _mm_cvtpi16_ps(m1);
+
+    a0Val = _mm_unpacklo_ps(f0, f1);
+    a1Val = _mm_unpackhi_ps(f0, f1);
+    a2Val = _mm_unpacklo_ps(f2, f3);
+    a3Val = _mm_unpackhi_ps(f2, f3);
+
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 8;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  number = sixteenthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr)   * (*bPtr++));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
new file mode 100644
index 000000000..8ef627a62
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_u.h
@@ -0,0 +1,63 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+  uint16_t* inputPtr = intsToSwap;
+  __m128i input, left, right, output;
+
+  const unsigned int eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load the 16t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_loadu_si128((__m128i*)inputPtr);
+    // Do the two shifts
+    left = _mm_slli_epi16(input, 8);
+    right = _mm_srli_epi16(input, 8);
+    // Or the left and right halves together
+    output = _mm_or_si128(left, right);
+    // Store the results
+    _mm_storeu_si128((__m128i*)inputPtr, output);
+    inputPtr += 8;
+  }
+
+  // Byteswap any remaining points:
+  number = eighthPoints*8;
+  for(; number < num_points; number++){
+    uint16_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an unaligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int point;
+  uint16_t* inputPtr = intsToSwap;
+  for(point = 0; point < num_points; point++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
new file mode 100644
index 000000000..961c2418c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_16i_a.h
@@ -0,0 +1,98 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_generic(int16_t* result, const float* input, const float* taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = (short)dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
index 448b2fdc0..067c33ad8 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a.h
@@ -31,39 +31,60 @@ static inline void volk_32f_x2_dot_prod_32f_a_generic(float * result, const floa
 static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int sixteenthPoints = num_points / 16;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+  for(;number < sixteenthPoints; number++){
 
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
+  number = sixteenthPoints*16;
   for(;number < num_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
@@ -80,38 +101,59 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse( float* result, const  float*
 
 static inline void volk_32f_x2_dot_prod_32f_a_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int sixteenthPoints = num_points / 16;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-    aVal = _mm_load_ps(aPtr);
-    bVal = _mm_load_ps(bPtr);
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+  for(;number < sixteenthPoints; number++){
 
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+    b0Val = _mm_load_ps(bPtr);
+    b1Val = _mm_load_ps(bPtr+4);
+    b2Val = _mm_load_ps(bPtr+8);
+    b3Val = _mm_load_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
+  number = sixteenthPoints*16;
   for(;number < num_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
@@ -182,4 +224,67 @@ static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float * result, const float
 
 #endif /*LV_HAVE_SSE4_1*/
 
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m256 a0Val, a1Val;
+  __m256 b0Val, b1Val;
+  __m256 c0Val, c1Val;
+
+  __m256 dotProdVal0 = _mm256_setzero_ps();
+  __m256 dotProdVal1 = _mm256_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm256_load_ps(aPtr);
+    a1Val = _mm256_load_ps(aPtr+8);
+    b0Val = _mm256_load_ps(bPtr);
+    b1Val = _mm256_load_ps(bPtr+8);
+
+    c0Val = _mm256_mul_ps(a0Val, b0Val);
+    c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+  _mm256_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  dotProduct += dotProductVector[4];
+  dotProduct += dotProductVector[5];
+  dotProduct += dotProductVector[6];
+  dotProduct += dotProductVector[7];
+
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index 3b7284b57..b24e8b1f7 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -1,6 +1,7 @@
 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
 
+#include <volk/volk_common.h>
 #include<stdio.h>
 
 
@@ -30,39 +31,60 @@ static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const floa
 static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
 
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int sixteenthPoints = num_points / 16;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+  for(;number < sixteenthPoints; number++){
 
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
   __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
   dotProduct += dotProductVector[2];
   dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
+  number = sixteenthPoints*16;
   for(;number < num_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
@@ -79,38 +101,59 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float*
 
 static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
   unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
+  const unsigned int sixteenthPoints = num_points / 16;
 
   float dotProduct = 0;
   const float* aPtr = input;
   const float* bPtr = taps;
 
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
 
-    aVal = _mm_loadu_ps(aPtr);
-    bVal = _mm_loadu_ps(bPtr);
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
 
-    cVal = _mm_mul_ps(aVal, bVal);
-
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+  for(;number < sixteenthPoints; number++){
 
-    aPtr += 4;
-    bPtr += 4;
+    a0Val = _mm_loadu_ps(aPtr);
+    a1Val = _mm_loadu_ps(aPtr+4);
+    a2Val = _mm_loadu_ps(aPtr+8);
+    a3Val = _mm_loadu_ps(aPtr+12);
+    b0Val = _mm_loadu_ps(bPtr);
+    b1Val = _mm_loadu_ps(bPtr+4);
+    b2Val = _mm_loadu_ps(bPtr+8);
+    b3Val = _mm_loadu_ps(bPtr+12);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+    dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+    dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+    dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+    aPtr += 16;
+    bPtr += 16;
   }
 
-  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
 
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
 
   dotProduct = dotProductVector[0];
   dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
 
-  number = quarterPoints * 4;
+  number = sixteenthPoints*16;
   for(;number < num_points; number++){
     dotProduct += ((*aPtr++) * (*bPtr++));
   }
@@ -181,4 +224,67 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
 
 #endif /*LV_HAVE_SSE4_1*/
 
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m256 a0Val, a1Val;
+  __m256 b0Val, b1Val;
+  __m256 c0Val, c1Val;
+
+  __m256 dotProdVal0 = _mm256_setzero_ps();
+  __m256 dotProdVal1 = _mm256_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm256_loadu_ps(aPtr);
+    a1Val = _mm256_loadu_ps(aPtr+8);
+    b0Val = _mm256_loadu_ps(bPtr);
+    b1Val = _mm256_loadu_ps(bPtr+8);
+
+    c0Val = _mm256_mul_ps(a0Val, b0Val);
+    c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+    dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+    aPtr += 16;
+    bPtr += 16;
+  }
+
+  dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+  _mm256_storeu_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+  dotProduct += dotProductVector[4];
+  dotProduct += dotProductVector[5];
+  dotProduct += dotProductVector[6];
+  dotProduct += dotProductVector[7];
+
+  number = sixteenthPoints*16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_AVX*/
+
 #endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
new file mode 100644
index 000000000..109b787e8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_dot_prod_32fc_a.h
@@ -0,0 +1,111 @@
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <volk/volk_common.h>
+#include<stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const float * taps, unsigned int num_points) {
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const float* aPtr = (float*)input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  *realpt = 0;
+  *imagpt = 0;
+
+  for(number = 0; number < num_points; number++){
+    *realpt += ((*aPtr++) * (*bPtr));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse( lv_32fc_t* result, const  lv_32fc_t* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 8;
+
+  float res[2];
+  float *realpt = &res[0], *imagpt = &res[1];
+  const float* aPtr = (float*)input;
+  const float* bPtr = taps;
+
+  __m128 a0Val, a1Val, a2Val, a3Val;
+  __m128 b0Val, b1Val, b2Val, b3Val;
+  __m128 x0Val, x1Val, x2Val, x3Val;
+  __m128 c0Val, c1Val, c2Val, c3Val;
+
+  __m128 dotProdVal0 = _mm_setzero_ps();
+  __m128 dotProdVal1 = _mm_setzero_ps();
+  __m128 dotProdVal2 = _mm_setzero_ps();
+  __m128 dotProdVal3 = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+
+    a0Val = _mm_load_ps(aPtr);
+    a1Val = _mm_load_ps(aPtr+4);
+    a2Val = _mm_load_ps(aPtr+8);
+    a3Val = _mm_load_ps(aPtr+12);
+
+    x0Val = _mm_load_ps(bPtr);
+    x1Val = _mm_load_ps(bPtr);
+    x2Val = _mm_load_ps(bPtr+4);
+    x3Val = _mm_load_ps(bPtr+4);
+    b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+    b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+    b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+    b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+    c0Val = _mm_mul_ps(a0Val, b0Val);
+    c1Val = _mm_mul_ps(a1Val, b1Val);
+    c2Val = _mm_mul_ps(a2Val, b2Val);
+    c3Val = _mm_mul_ps(a3Val, b3Val);
+
+    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+    aPtr += 16;
+    bPtr += 8;
+  }
+
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+  _mm_store_ps(dotProductVector,dotProdVal0); // Store the results back into the dot product vector
+
+  *realpt = dotProductVector[0];
+  *imagpt = dotProductVector[1];
+  *realpt += dotProductVector[2];
+  *imagpt += dotProductVector[3];
+
+  number = sixteenthPoints*8;
+  for(;number < num_points; number++){
+    *realpt += ((*aPtr++) * (*bPtr));
+    *imagpt += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
index 08db99373..51b6041ec 100644
--- a/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_x2_rotator_32fc_a.h
@@ -171,12 +171,10 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, c
     printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
     printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
     printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
-    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
+    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
     
     phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
     inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
-    zeros = _mm256_set1_ps(0.0);
-    negated = _mm256_set1_ps(-1.0);
     const unsigned int fourthPoints = num_points / 4;
 
     
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
index cb2ac4c67..caef3e6f0 100644
--- a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a.h
@@ -18,40 +18,26 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const
   unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
   unsigned int isodd = (num_bytes >> 3) &1;
 
-
-
   float sum0[2] = {0,0};
   float sum1[2] = {0,0};
   unsigned int i = 0;
 
-
   for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-
-
     sum0[0] += in[0] * tp[0] - in[1] * tp[1];
     sum0[1] += in[0] * tp[1] + in[1] * tp[0];
     sum1[0] += in[2] * tp[2] - in[3] * tp[3];
     sum1[1] += in[2] * tp[3] + in[3] * tp[2];
 
-
     in += 4;
     tp += 4;
-
   }
 
-
   res[0] = sum0[0] + sum1[0];
   res[1] = sum0[1] + sum1[1];
 
-
-
   for(i = 0; i < isodd; ++i) {
-
-
     *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
   }
-
 }
 
 #endif /*LV_HAVE_GENERIC*/
@@ -177,14 +163,8 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const
      );
 
 
-  int getem = num_bytes % 16;
-
-
-  for(; getem > 0; getem -= 8) {
-
-
+  if(((num_bytes >> 3) & 1)) {
     *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-
   }
 
   return;
@@ -363,7 +343,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
 
   dotProduct += ( dotProductVector[0] + dotProductVector[1] );
 
-  if((num_bytes >> 2) != 0) {
+  if(((num_bytes >> 3) & 1) != 0) {
     dotProduct += (*a) * (*b);
   }
 
@@ -377,9 +357,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
 #include <smmintrin.h>
 
 static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  volk_32fc_x2_dot_prod_32fc_a_sse3(result, input, taps, num_bytes);
-  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
-   /*
+
     __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
     float *p_input, *p_taps;
     __m64 *p_result;
@@ -442,11 +420,7 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
 
     }
 
-
-
-
-    real1 = _mm_xor_ps(real1, (__m128)neg);
-
+    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
 
     im0 = _mm_add_ps(im0, im1);
     real0 = _mm_add_ps(real0, real1);
@@ -459,7 +433,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const
 
     *result += input[i] * taps[i];
     }
-  */
 }
 
 #endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/include/volk/volk_32u_byteswap_u.h b/volk/include/volk/volk_32u_byteswap_u.h
new file mode 100644
index 000000000..e27d1f03d
--- /dev/null
+++ b/volk/include/volk/volk_32u_byteswap_u.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+
+  uint32_t* inputPtr = intsToSwap;
+  __m128i input, byte1, byte2, byte3, byte4, output;
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the 32t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_loadu_si128((__m128i*)inputPtr);
+    // Do the four shifts
+    byte1 = _mm_slli_epi32(input, 24);
+    byte2 = _mm_slli_epi32(input, 8);
+    byte3 = _mm_srli_epi32(input, 8);
+    byte4 = _mm_srli_epi32(input, 24);
+    // Or bytes together
+    output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    // Store the results
+    _mm_storeu_si128((__m128i*)inputPtr, output);
+    inputPtr += 4;
+  }
+
+  // Byteswap any remaining points:
+  number = quarterPoints*4;
+  for(; number < num_points; number++){
+    uint32_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_u_generic(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
diff --git a/volk/include/volk/volk_64u_byteswap_u.h b/volk/include/volk/volk_64u_byteswap_u.h
new file mode 100644
index 000000000..41a4a3130
--- /dev/null
+++ b/volk/include/volk/volk_64u_byteswap_u.h
@@ -0,0 +1,88 @@
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+    uint64_t number = 0;
+    const unsigned int halfPoints = num_points / 2;
+    for(;number < halfPoints; number++){
+      // Load the 32t values, increment inputPtr later since we're doing it in-place.
+      input = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Do the four shifts
+      byte1 = _mm_slli_epi32(input, 24);
+      byte2 = _mm_slli_epi32(input, 8);
+      byte3 = _mm_srli_epi32(input, 8);
+      byte4 = _mm_srli_epi32(input, 24);
+      // Or bytes together
+      output = _mm_or_si128(byte1, byte4);
+      byte2 = _mm_and_si128(byte2, byte2mask);
+      output = _mm_or_si128(output, byte2);
+      byte3 = _mm_and_si128(byte3, byte3mask);
+      output = _mm_or_si128(output, byte3);
+
+      // Reorder the two words
+      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+      // Store the results
+      _mm_storeu_si128((__m128i*)inputPtr, output);
+      inputPtr += 4;
+    }
+
+    // Byteswap any remaining points:
+    number = halfPoints*2;
+    for(; number < num_points; number++){
+      uint32_t output1 = *inputPtr;
+      uint32_t output2 = inputPtr[1];
+
+      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+      *inputPtr++ = output2;
+      *inputPtr++ = output1;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_u_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
+
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+    *inputPtr++ = output2;
+    *inputPtr++ = output1;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index 328a4e087..d9aeb797c 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -150,6 +150,12 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
     if (${SIZEOF_CPU} EQUAL 32)
         OVERRULE_ARCH(64 "CPU width is 32 bits")
     endif()
+
+    #MSVC 64 bit does not have MMX, overrule it
+    if (${SIZEOF_CPU} EQUAL 64 AND MSVC)
+        OVERRULE_ARCH(mmx "No MMX for Win64")
+    endif()
+
 endif()
 
 ########################################################################
@@ -174,7 +180,7 @@ execute_process(
 # When this occurs, eliminate the redundant machines
 # to avoid unnecessary compilation of subset machines.
 ########################################################################
-foreach(arch orc 64 32)
+foreach(arch mmx orc 64 32)
     foreach(machine_name ${available_machines})
         string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
         if (${machine_name} STREQUAL ${machine_name_no_arch})
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index aac676729..2e41c25da 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -20,6 +20,8 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
 //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
 //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
 VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1);
+//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
 VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1);
@@ -34,7 +36,8 @@ VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
-VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
 VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
 VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
@@ -53,6 +56,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460,
 VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
 VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
 //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
 VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);
 VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32767, 20460, 1);