From d825bb2bc1c7bc4105723ae5f699f283bc4f3c44 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 26 Jan 2012 17:53:49 -0500
Subject: volk: float_to_short now clips the values instead of wrapping around.

---
 volk/include/volk/volk_32f_s32f_convert_16i_a.h | 52 +++++++++++++++++++++----
 volk/include/volk/volk_32f_s32f_convert_16i_u.h | 52 +++++++++++++++++++++----
 2 files changed, 90 insertions(+), 14 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
index 0a2b4f0f2..10c921b08 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1, inputVal2;
   __m128i intInputVal1, intInputVal2;
+  __m128 ret1, ret2;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < eighthPoints; number++){
     inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
     inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
+    // Scale and clip
+    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(ret1);
+    intInputVal2 = _mm_cvtps_epi32(ret2);
+
     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
 
     _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
@@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
 
   number = eighthPoints * 8;    
   for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -61,8 +78,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -70,7 +94,8 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
     ret = _mm_load_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
@@ -81,7 +106,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -98,9 +128,17 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co
   int16_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+    r  = *inputVectorPtr++ * scalar;
+    if(r < min_val)
+      r = min_val;
+    else if(r > max_val)
+      r = max_val;
+    *outputVectorPtr++ = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
index dec3f1611..f339a7d10 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1, inputVal2;
   __m128i intInputVal1, intInputVal2;
+  __m128 ret1, ret2;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < eighthPoints; number++){
     inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
     inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
+    // Scale and clip
+    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(ret1);
+    intInputVal2 = _mm_cvtps_epi32(ret2);
+
     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
 
     _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
@@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
 
   number = eighthPoints * 8;    
   for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -62,8 +79,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int16_t* outputVectorPtr = outputVector;
+
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -71,7 +95,8 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
     ret = _mm_loadu_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    // Scale and clip
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
@@ -82,7 +107,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -100,9 +130,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co
   int16_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -32768;
+  float max_val = 32767;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
+    r = *inputVectorPtr++  * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
-- 
cgit 


From d8b02979cef097971bc0656b904f7b51d19b03c9 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 26 Jan 2012 20:07:26 -0500
Subject: volk: fix a warning.

---
 volk/include/volk/volk_64u_popcnt_a.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h
index bdaa98643..4683f1e38 100644
--- a/volk/include/volk/volk_64u_popcnt_a.h
+++ b/volk/include/volk/volk_64u_popcnt_a.h
@@ -10,10 +10,11 @@
 
 static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) {
 
-  const uint32_t* valueVector = (const uint32_t*)&value;
+  //const uint32_t* valueVector = (const uint32_t*)&value;
   
   // This is faster than a lookup table
-  uint32_t retVal = valueVector[0];
+  //uint32_t retVal = valueVector[0];
+  uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF);
 
   retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
   retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
@@ -22,7 +23,8 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value
   retVal = (retVal + (retVal >> 16)) & 0x0000003F;
   uint64_t retVal64  = retVal;
 
-  retVal = valueVector[1];
+  //retVal = valueVector[1];
+  retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31);
   retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
   retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
   retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
-- 
cgit 


From 42d9560a50bbbab143b48bda5a73a5379818ddbe Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 26 Jan 2012 20:07:51 -0500
Subject: volk: float_to_int and float_to_char updated to clip instead of wrap
 around. The float to int clips at smaller than 2^32 because of the limits of
 the float representation.

---
 volk/include/volk/volk_32f_s32f_convert_32i_a.h | 60 ++++++++++++++++++++++---
 volk/include/volk/volk_32f_s32f_convert_32i_u.h | 45 ++++++++++++++++---
 volk/include/volk/volk_32f_s32f_convert_8i_a.h  | 53 ++++++++++++++++++----
 volk/include/volk/volk_32f_s32f_convert_8i_u.h  | 53 ++++++++++++++++++----
 4 files changed, 183 insertions(+), 28 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
index aa370e614..15fa282fb 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
@@ -21,14 +21,22 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
+
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
+
   __m256 vScalar = _mm256_set1_ps(scalar);
   __m256 inputVal1;
   __m256i intInputVal1;
+  __m256 vmin_val = _mm256_set1_ps(min_val);
+  __m256 vmax_val = _mm256_set1_ps(max_val);
 
   for(;number < eighthPoints; number++){
     inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
 
-    intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar));
+    inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    intInputVal1 = _mm256_cvtps_epi32(inputVal1);
 
     _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
     outputVectorPtr += 8;
@@ -36,7 +44,12 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const
 
   number = eighthPoints * 8;    
   for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_AVX */
@@ -57,14 +70,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
+
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1;
   __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < quarterPoints; number++){
     inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
 
     _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
     outputVectorPtr += 4;
@@ -72,7 +93,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -93,8 +119,15 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
+
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -102,7 +135,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const
     ret = _mm_load_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
@@ -113,7 +146,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -130,9 +168,17 @@ static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, co
   int32_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
index b4e954dc4..d8493454b 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
@@ -21,14 +21,24 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
+
+  //float min_val = -2147483647;
+  //float max_val = 2147483647;
+  float min_val = -2146400000;
+  float max_val = 2146400000;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1;
   __m128i intInputVal1;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < quarterPoints; number++){
     inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
 
     _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
     outputVectorPtr += 4;
@@ -36,7 +46,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -58,8 +73,15 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const
     
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
+
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -67,7 +89,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const
     ret = _mm_loadu_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
@@ -78,7 +100,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -96,9 +123,17 @@ static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, co
   int32_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -2147483647;
+  float max_val = 2147483647;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int32_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
index 8d87a07d7..05172171c 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h
@@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
     
   const float* inputVectorPtr = (const float*)inputVector;
   int8_t* outputVectorPtr = outputVector;
+
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1, inputVal2, inputVal3, inputVal4;
   __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < sixteenthPoints; number++){
     inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
@@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
     inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
     inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
     
     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
     intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
@@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f
 
   number = sixteenthPoints * 16;    
   for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int8_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -67,9 +84,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl
   const unsigned int quarterPoints = num_points / 4;
     
   const float* inputVectorPtr = (const float*)inputVector;
+
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
   int8_t* outputVectorPtr = outputVector;
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -77,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl
     ret = _mm_load_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
@@ -88,7 +112,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int8_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -105,9 +134,17 @@ static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, cons
   int8_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -128;
+  float max_val = 127;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = (int8_t)(*inputVectorPtr++  * scalar);
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int8_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
index 1c6bf87c9..12991e9c1 100644
--- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
@@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
     
   const float* inputVectorPtr = (const float*)inputVector;
   int8_t* outputVectorPtr = outputVector;
+
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 inputVal1, inputVal2, inputVal3, inputVal4;
   __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   for(;number < sixteenthPoints; number++){
     inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
@@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
     inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
     inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
 
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+    inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+    inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+    inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+    intInputVal1 = _mm_cvtps_epi32(inputVal1);
+    intInputVal2 = _mm_cvtps_epi32(inputVal2);
+    intInputVal3 = _mm_cvtps_epi32(inputVal3);
+    intInputVal4 = _mm_cvtps_epi32(inputVal4);
     
     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
     intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
@@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f
 
   number = sixteenthPoints * 16;    
   for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -69,8 +86,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
     
   const float* inputVectorPtr = (const float*)inputVector;
   int8_t* outputVectorPtr = outputVector;
+
+  float min_val = -128;
+  float max_val = 127;
+  float r;
+
   __m128 vScalar = _mm_set_ps1(scalar);
   __m128 ret;
+  __m128 vmin_val = _mm_set_ps1(min_val);
+  __m128 vmax_val = _mm_set_ps1(max_val);
 
   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
 
@@ -78,7 +102,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
     ret = _mm_loadu_ps(inputVectorPtr);
     inputVectorPtr += 4;
 
-    ret = _mm_mul_ps(ret, vScalar);
+    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
     *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
@@ -89,7 +113,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl
 
   number = quarterPoints * 4;    
   for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+    r = inputVector[number] * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    outputVector[number] = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -107,9 +136,17 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons
   int8_t* outputVectorPtr = outputVector;
   const float* inputVectorPtr = inputVector;
   unsigned int number = 0;
+  float min_val = -128;
+  float max_val = 127;
+  float r;
 
   for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+    r = *inputVectorPtr++ * scalar;
+    if(r > max_val)
+      r = max_val;
+    else if(r < min_val)
+      r = min_val;
+    *outputVectorPtr++ = (int16_t)(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
-- 
cgit 


From d870487437b33d1a26eb8f5f5aaa414ca6ed24e0 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Sat, 28 Jan 2012 19:31:33 -0500
Subject: volk: fix lower bound of int conversion.

---
 volk/include/volk/volk_32f_s32f_convert_32i_a.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
index 15fa282fb..8f2fc791e 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h
@@ -22,7 +22,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
 
-  float min_val = -2147483647;
+  float min_val = -2147483648;
   float max_val = 2147483647;
   float r;
 
@@ -71,7 +71,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const
   const float* inputVectorPtr = (const float*)inputVector;
   int32_t* outputVectorPtr = outputVector;
 
-  float min_val = -2147483647;
+  float min_val = -2147483648;
   float max_val = 2147483647;
   float r;
 
-- 
cgit 


From 9c9b9a8aa05c7cde7cd11cd5d1e052630715d252 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Sun, 29 Jan 2012 17:31:37 -0500
Subject: volk: added unaligned volk function for magnitude of a complex
 number.

---
 volk/include/volk/volk_32fc_magnitude_32f_u.h | 118 ++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h
new file mode 100644
index 000000000..ed1cedef9
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h
@@ -0,0 +1,118 @@
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
-- 
cgit 


From d142fd9e31715bae85d65c4790f278143a784142 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Sun, 29 Jan 2012 17:32:30 -0500
Subject: volk: added volk magnitiude squared functions (aligned/unaligned) for
 complex numbers.

---
 volk/include/volk/Makefile.am                      |   3 +
 .../volk/volk_32fc_magnitude_squared_32f_a.h       | 114 +++++++++++++++++++++
 .../volk/volk_32fc_magnitude_squared_32f_u.h       | 114 +++++++++++++++++++++
 3 files changed, 231 insertions(+)
 create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
 create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index e7333a015..c2502f6e6 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -71,6 +71,9 @@ volkinclude_HEADERS = \
 	volk_32fc_index_max_16u_a.h \
 	volk_32fc_s32f_magnitude_16i_a.h \
 	volk_32fc_magnitude_32f_a.h \
+	volk_32fc_magnitude_32f_u.h \
+	volk_32fc_magnitude_squared_32f_a.h \
+	volk_32fc_magnitude_squared_32f_u.h \
 	volk_32fc_x2_multiply_32fc_a.h \
 	volk_32f_s32f_convert_16i_a.h \
 	volk_32f_s32f_convert_16i_u.h \
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
new file mode 100644
index 000000000..00bdefbb5
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h
@@ -0,0 +1,114 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
new file mode 100644
index 000000000..6eb4a523a
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h
@@ -0,0 +1,114 @@
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      _mm_storeu_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
-- 
cgit 


From cb458204245632ac7a571bfe96b90d3c55a2f01a Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Mon, 30 Jan 2012 00:22:44 -0500
Subject: volk: adding complex to imag kernel.

---
 volk/include/volk/Makefile.am                      |  1 +
 .../volk/volk_32fc_deinterleave_imag_32f_a.h       | 68 ++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index c2502f6e6..c5b99c41b 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -65,6 +65,7 @@ volkinclude_HEADERS = \
 	volk_32fc_deinterleave_64f_x2_a.h \
 	volk_32fc_s32f_deinterleave_real_16i_a.h \
 	volk_32fc_deinterleave_real_32f_a.h \
+	volk_32fc_deinterleave_imag_32f_a.h \
 	volk_32fc_deinterleave_real_64f_a.h \
 	volk_32fc_x2_dot_prod_32fc_a.h \
 	volk_32fc_x2_dot_prod_32fc_u.h \
diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
new file mode 100644
index 000000000..adc4112b9
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into Q vector data
+  \param complexVector The complex input vector
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  float* qBufferPtr = qBuffer;
+
+  __m128 cplxValue1, cplxValue2, iValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in q1q2q3q4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(qBufferPtr, iValue);
+
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into Q vector data
+  \param complexVector The complex input vector
+  \param qBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* qBufferPtr = qBuffer;
+  for(number = 0; number < num_points; number++){
+    complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
-- 
cgit 


From 070a6c9ce93c2f2043a0145e253cc55f801f9433 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 2 Feb 2012 14:25:53 -0500
Subject: volk: adding unaligned versions of complex multiply a constant and
 complex multiply 2 streams.

---
 .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 42 ++++++++++++
 volk/include/volk/volk_32fc_x2_multiply_32fc_u.h   | 77 ++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
 create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
new file mode 100644
index 000000000..201dcf5f6
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -0,0 +1,42 @@
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = num_points;
+
+    // unwrap loop
+    while (number >= 8){
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      number -= 8;
+    }
+
+    // clean up any remaining
+    while (number-- > 0)
+      *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
new file mode 100644
index 000000000..729c1a4ad
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+   
+      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * (*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
-- 
cgit 


From 67d23bdecd3f15197bdf46f8c0cd66a6f754fea5 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 2 Feb 2012 16:35:14 -0500
Subject: volk: improving performance of multiply_const and multiply two
 streams.

---
 .../include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 63 +++++++++++++++++++++-
 .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 45 ++++++++++++++++
 volk/include/volk/volk_32fc_x2_multiply_32fc_a.h   |  1 -
 3 files changed, 106 insertions(+), 3 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
index b27a7259f..205461afb 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -6,6 +6,52 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    // Set up constant scalar vector
+    yl = _mm_set_ps1(lv_creal(scalar));
+    yh = _mm_set_ps1(lv_cimag(scalar));
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+
 #ifdef LV_HAVE_GENERIC
   /*!
     \brief Multiplies the two input complex vectors and stores their results in the third vector
@@ -17,11 +63,24 @@
 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
     lv_32fc_t* cPtr = cVector;
     const lv_32fc_t* aPtr = aVector;
-    unsigned int number = 0;
+    unsigned int number = num_points;
 
-    for(number = 0; number < num_points; number++){
+    // unwrap loop
+    while (number >= 8){
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
       *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      *cPtr++ = (*aPtr++) * scalar;
+      number -= 8;
     }
+
+    // clean up any remaining
+    while (number-- > 0)
+      *cPtr++ = *aPtr++ * scalar;
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
index 201dcf5f6..a9dfcda19 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -6,6 +6,51 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    // Set up constant scalar vector
+    yl = _mm_set_ps1(lv_creal(scalar));
+    yh = _mm_set_ps1(lv_cimag(scalar));
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
 #ifdef LV_HAVE_GENERIC
   /*!
     \brief Multiplies the two input complex vectors and stores their results in the third vector
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
index 18dd092e8..aec8bd716 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h
@@ -23,7 +23,6 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l
     lv_32fc_t* c = cVector;
     const lv_32fc_t* a = aVector;
     const lv_32fc_t* b = bVector;
-
     for(;number < halfPoints; number++){
       
       x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-- 
cgit 


From b3e538493f969526c704a3eb1b3c8d8c61be78f6 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 2 Feb 2012 16:38:36 -0500
Subject: volk: adding new functions to Makefile.

---
 volk/include/volk/Makefile.am | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index c5b99c41b..eea006bf2 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -56,6 +56,7 @@ volkinclude_HEADERS = \
 	volk_32f_s32f_multiply_32f_a.h \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
+	volk_32fc_s32fc_multiply_32fc_u.h \
 	volk_32fc_s32f_power_32fc_a.h \
 	volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \
 	volk_32fc_s32f_atan2_32f_a.h \
@@ -76,6 +77,7 @@ volkinclude_HEADERS = \
 	volk_32fc_magnitude_squared_32f_a.h \
 	volk_32fc_magnitude_squared_32f_u.h \
 	volk_32fc_x2_multiply_32fc_a.h \
+	volk_32fc_x2_multiply_32fc_u.h \
 	volk_32f_s32f_convert_16i_a.h \
 	volk_32f_s32f_convert_16i_u.h \
 	volk_32f_s32f_convert_32i_a.h \
-- 
cgit 


From ae663decab658be25ac01072fa2f5c8454bd6167 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 2 Feb 2012 17:26:39 -0500
Subject: core: moving multiply_const_ff from gengen to general to take
 advantage of volk.

Also adds SSE and AVX and unaligned Volk versions for this.
---
 volk/include/volk/volk_32f_s32f_multiply_32f_a.h   | 75 ++++++++++++++++++++++
 .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 12 ++--
 2 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
index 37223dc81..d1c6f3f65 100644
--- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h
@@ -4,6 +4,81 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param scalar the scalar value
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, bVal, cVal;
+    bVal = _mm_set_ps1(scalar);
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param scalar the scalar value
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, bVal, cVal;
+    bVal = _mm256_set1_ps(scalar);
+    for(;number < eighthPoints; number++){
+      
+      aVal = _mm256_load_ps(aPtr); 
+      
+      cVal = _mm256_mul_ps(aVal, bVal); 
+      
+      _mm256_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Scalar float multiply
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
index a9dfcda19..450a89066 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -9,10 +9,10 @@
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
+  \brief Multiplies the input vector by a scalar and stores the results in the third vector
     \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
+    \param aVector The vector to be multiplied
+    \param scalar The complex scalar to multiply aVector
     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
   */
 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
@@ -53,10 +53,10 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
 
 #ifdef LV_HAVE_GENERIC
   /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
+  \brief Multiplies the input vector by a scalar and stores the results in the third vector
     \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
+    \param aVector The vector to be multiplied
+    \param scalar The complex scalar to multiply aVector
     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
   */
 static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-- 
cgit 


From f028a198cb47e0486ad41a29bd3b3dcf0663d766 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Sat, 4 Feb 2012 11:04:30 -0500
Subject: volk: new unaligned versions of float multipliers.

---
 volk/include/volk/Makefile.am                      |   2 +
 volk/include/volk/volk_32f_s32f_multiply_32f_u.h   | 102 ++++++++++++++++++++
 volk/include/volk/volk_32f_x2_multiply_32f_u.h     | 106 +++++++++++++++++++++
 .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h |  24 ++---
 4 files changed, 222 insertions(+), 12 deletions(-)
 create mode 100644 volk/include/volk/volk_32f_s32f_multiply_32f_u.h
 create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index eea006bf2..312ff2d5a 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -54,6 +54,7 @@ volkinclude_HEADERS = \
 	volk_32f_accumulator_s32f_a.h \
 	volk_32f_x2_add_32f_a.h \
 	volk_32f_s32f_multiply_32f_a.h \
+	volk_32f_s32f_multiply_32f_u.h \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_u.h \
@@ -100,6 +101,7 @@ volkinclude_HEADERS = \
 	volk_32f_x2_max_32f_a.h \
 	volk_32f_x2_min_32f_a.h \
 	volk_32f_x2_multiply_32f_a.h \
+	volk_32f_x2_multiply_32f_u.h \
 	volk_32f_s32f_normalize_a.h \
 	volk_32f_s32f_power_32f_a.h \
 	volk_32f_sqrt_32f_a.h \
diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
new file mode 100644
index 000000000..0e700060f
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param scalar the scalar value
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, bVal, cVal;
+    bVal = _mm_set_ps1(scalar);
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_loadu_ps(aPtr); 
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param scalar the scalar value
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m256 aVal, bVal, cVal;
+    bVal = _mm256_set1_ps(scalar);
+    for(;number < eighthPoints; number++){
+      
+      aVal = _mm256_loadu_ps(aPtr); 
+      
+      cVal = _mm256_mul_ps(aVal, bVal); 
+      
+      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * scalar;
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Scalar float multiply
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param scalar the scalar value
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const float* inputPtr = aVector;
+  float* outputPtr = cVector;
+  for(number = 0; number < num_points; number++){
+    *outputPtr = (*inputPtr) * scalar;
+    inputPtr++;
+    outputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
new file mode 100644
index 000000000..6c3ce5d83
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_loadu_ps(aPtr); 
+      bVal = _mm_loadu_ps(bPtr);
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Multiplies the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m256 aVal, bVal, cVal;
+    for(;number < eighthPoints; number++){
+      
+      aVal = _mm256_loadu_ps(aPtr); 
+      bVal = _mm256_loadu_ps(bPtr);
+      
+      cVal = _mm256_mul_ps(aVal, bVal); 
+      
+      _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 8;
+      bPtr += 8;
+      cPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
index 450a89066..218c450f8 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h
@@ -8,13 +8,13 @@
 
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
-  /*!
+/*!
   \brief Multiplies the input vector by a scalar and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The vector to be multiplied
-    \param scalar The complex scalar to multiply aVector
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
   unsigned int number = 0;
     const unsigned int halfPoints = num_points / 2;
@@ -52,13 +52,13 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons
 #endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_GENERIC
-  /*!
+/*!
   \brief Multiplies the input vector by a scalar and stores the results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector The vector to be multiplied
-    \param scalar The complex scalar to multiply aVector
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be multiplied
+  \param scalar The complex scalar to multiply aVector
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
 static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
     lv_32fc_t* cPtr = cVector;
     const lv_32fc_t* aPtr = aVector;
-- 
cgit 


From 298623615b249de459cd12b5507dab921fc3210b Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Mon, 6 Feb 2012 10:31:28 -0500
Subject: volk: added unaligned version of adding 2 vectors.

---
 volk/include/volk/Makefile.am             |  1 +
 volk/include/volk/volk_32f_x2_add_32f_u.h | 66 +++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 volk/include/volk/volk_32f_x2_add_32f_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 312ff2d5a..20864efbe 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -53,6 +53,7 @@ volkinclude_HEADERS = \
 	volk_16u_byteswap_a.h \
 	volk_32f_accumulator_s32f_a.h \
 	volk_32f_x2_add_32f_a.h \
+	volk_32f_x2_add_32f_u.h \
 	volk_32f_s32f_multiply_32f_a.h \
 	volk_32f_s32f_multiply_32f_u.h \
 	volk_32fc_32f_multiply_32fc_a.h \
diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h
new file mode 100644
index 000000000..e360a7958
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_u.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_loadu_ps(aPtr); 
+      bVal = _mm_loadu_ps(bPtr);
+      
+      cVal = _mm_add_ps(aVal, bVal); 
+      
+      _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
-- 
cgit 


From 69210086c7ae98b93a63a1d810ee28b304a13520 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Mon, 6 Feb 2012 22:02:09 -0500
Subject: volk: added a 32fc multiply conjugate kernel.

---
 volk/include/volk/Makefile.am                      |  2 +
 .../volk/volk_32fc_x2_multiply_conjugate_32fc_a.h  | 82 ++++++++++++++++++++++
 .../volk/volk_32fc_x2_multiply_conjugate_32fc_u.h  | 81 +++++++++++++++++++++
 3 files changed, 165 insertions(+)
 create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
 create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index 20864efbe..d071f18f2 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -59,6 +59,8 @@ volkinclude_HEADERS = \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_u.h \
+	volk_32fc_s32fc_multiply_conjugate_32fc_a.h \
+	volk_32fc_s32fc_multiply_conjugate_32fc_u.h \
 	volk_32fc_s32f_power_32fc_a.h \
 	volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \
 	volk_32fc_s32f_atan2_32f_a.h \
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
new file mode 100644
index 000000000..70476a8c7
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    __m128 conjugator = _mm_setr_ps(1, -1, 1, -1);
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+      // FIXME: replace with xor for a faster implementation
+      y = _mm_mul_ps(y, conjugator); // conjugate y
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * lv_conj(*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
new file mode 100644
index 000000000..fbaa29c17
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000);
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+      y = _mm_xor_ps(y, conjugator); // conjugate y
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * lv_conj(*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector First vector to be multiplied
+    \param bVector Second vector that is conjugated before being multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
-- 
cgit 


From cdb328758dca9fa494956c0e62f5e78adf613982 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Tue, 7 Feb 2012 16:59:50 -0500
Subject: volk: fixed complex multiply and conjugate kernel to use xor for
 conjugation.

---
 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 5 ++---
 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
index 70476a8c7..2a1bcbce0 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h
@@ -24,15 +24,14 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto
     const lv_32fc_t* a = aVector;
     const lv_32fc_t* b = bVector;
 
-    __m128 conjugator = _mm_setr_ps(1, -1, 1, -1);
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
 
     for(;number < halfPoints; number++){
       
       x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
       y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
 
-      // FIXME: replace with xor for a faster implementation
-      y = _mm_mul_ps(y, conjugator); // conjugate y
+      y = _mm_xor_ps(y, conjugator); // conjugate y
       
       yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
       yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
index fbaa29c17..92f6a051e 100644
--- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
+++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h
@@ -24,7 +24,7 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto
     const lv_32fc_t* a = aVector;
     const lv_32fc_t* b = bVector;
 
-    __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000);
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
 
     for(;number < halfPoints; number++){
       
-- 
cgit 


From 3080cd75a6a10aab757e1e02fb99e81e2f3724d5 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Tue, 7 Feb 2012 17:23:44 -0500
Subject: volk: adding complex conjugate kernel.

---
 volk/include/volk/Makefile.am                  |  5 +-
 volk/include/volk/volk_32fc_conjugate_32fc_a.h | 64 ++++++++++++++++++++++++++
 volk/include/volk/volk_32fc_conjugate_32fc_u.h | 64 ++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_a.h
 create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_u.h

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index d071f18f2..f6b5835b1 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -134,4 +134,7 @@ volkinclude_HEADERS = \
 	volk_8i_convert_16i_a.h \
 	volk_8i_convert_16i_u.h \
 	volk_8i_s32f_convert_32f_a.h \
-	volk_8i_s32f_convert_32f_u.h 
+	volk_8i_s32f_convert_32f_u.h \
+	volk_32fc_conjugate_32fc_a.h \
+	volk_32fc_conjugate_32fc_u.h
+
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
new file mode 100644
index 000000000..1518af9be
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+      x = _mm_xor_ps(x, conjugator); // conjugate register
+    
+      _mm_store_ps((float*)c,x); // Store the results back into the C container
+
+      a += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = lv_conj(*a);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = lv_conj(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
new file mode 100644
index 000000000..b26fe0789
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+  
+    __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+      
+      x = _mm_xor_ps(x, conjugator); // conjugate register
+
+      _mm_storeu_ps((float*)c,x); // Store the results back into the C container
+
+      a += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = lv_conj(*a);
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Takes the conjugate of a complex vector.
+    \param cVector The vector where the results will be stored
+    \param aVector Vector to be conjugated
+    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+  */
+static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = lv_conj(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
-- 
cgit 


From 2eaa0a6e1e57cfc374c258c317ecb469fc49bf53 Mon Sep 17 00:00:00 2001
From: Johnathan Corgan
Date: Tue, 14 Feb 2012 15:37:57 -0800
Subject: build: fix autotools for gnuradio-core volkification

---
 volk/include/volk/Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index f6b5835b1..a01ddf193 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -59,8 +59,8 @@ volkinclude_HEADERS = \
 	volk_32fc_32f_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_a.h \
 	volk_32fc_s32fc_multiply_32fc_u.h \
-	volk_32fc_s32fc_multiply_conjugate_32fc_a.h \
-	volk_32fc_s32fc_multiply_conjugate_32fc_u.h \
+	volk_32fc_x2_multiply_conjugate_32fc_a.h \
+	volk_32fc_x2_multiply_conjugate_32fc_u.h \
 	volk_32fc_s32f_power_32fc_a.h \
 	volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \
 	volk_32fc_s32f_atan2_32f_a.h \
-- 
cgit 


From fa8ab7cb146287a9f0d8db67e3126ab4a867a201 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Tue, 21 Feb 2012 15:41:27 -0800
Subject: Volk: add scalar const support to the profiler/QA code. Disabled
 volk_32fc_s32fc_multiply_32fc_a's Orc impl due to it not working.

---
 volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
index b27a7259f..75cf8c8b2 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -34,9 +34,9 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c
     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
   */
 extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
-static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-    volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points);
-}
+//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+//    volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points);
+//}
 #endif /* LV_HAVE_ORC */
 
 
-- 
cgit 


From 330cddf31208b843c0997c6fb05cb3facf31f536 Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Wed, 22 Feb 2012 08:46:55 -0800
Subject: Remove ORC invocation since // doesn't dissuade the generator.

---
 volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
index 75cf8c8b2..665fad47a 100644
--- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
+++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h
@@ -25,20 +25,6 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c
 }
 #endif /* LV_HAVE_GENERIC */
 
-#ifdef LV_HAVE_ORC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points);
-//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
-//    volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points);
-//}
-#endif /* LV_HAVE_ORC */
-
 
 
-- 
cgit 


From e8d644872837f4cbfc05851710531b2ac5259806 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Thu, 23 Feb 2012 14:28:21 -0500
Subject: volk: float to short conversion is consistent between archs and tail
 cases. Rounds to nearest number.

---
 volk/include/volk/volk_32f_s32f_convert_16i_a.h | 15 ++++++++-------
 volk/include/volk/volk_32f_s32f_convert_16i_u.h | 15 ++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
index 10c921b08..a24959678 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -4,6 +4,7 @@
 #include <volk/volk_common.h>
 #include <inttypes.h>
 #include <stdio.h>
+#include <math.h>
 
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
@@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
       r = max_val;
     else if(r < min_val)
       r = min_val;
-    outputVector[number] = (int16_t)(r);
+    outputVector[number] = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -98,10 +99,10 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
     ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
   }
 
   number = quarterPoints * 4;    
@@ -111,7 +112,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
       r = max_val;
     else if(r < min_val)
       r = min_val;
-    outputVector[number] = (int16_t)(r);
+    outputVector[number] = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -138,7 +139,7 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co
       r = min_val;
     else if(r > max_val)
       r = max_val;
-    *outputVectorPtr++ = (int16_t)(r);
+    *outputVectorPtr++ = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
index f339a7d10..f58158041 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -3,6 +3,7 @@
 
 #include <inttypes.h>
 #include <stdio.h>
+#include <math.h>
 
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
@@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
       r = max_val;
     else if(r < min_val)
       r = min_val;
-    outputVector[number] = (int16_t)(r);
+    outputVector[number] = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_SSE2 */
@@ -99,10 +100,10 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
     ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
 
     _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
   }
 
   number = quarterPoints * 4;    
@@ -112,7 +113,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
       r = max_val;
     else if(r < min_val)
       r = min_val;
-    outputVector[number] = (int16_t)(r);
+    outputVector[number] = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_SSE */
@@ -140,7 +141,7 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co
       r = max_val;
     else if(r < min_val)
       r = min_val;
-    *outputVectorPtr++ = (int16_t)(r);
+    *outputVectorPtr++ = (int16_t)rintf(r);
   }
 }
 #endif /* LV_HAVE_GENERIC */
-- 
cgit 


From 4f0add175f8e44922c15ed82d644597eceb65fb6 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Sat, 3 Mar 2012 10:11:28 -0500
Subject: volk: include config.h to have rintf in windows/msvc.

---
 volk/include/volk/volk_32f_s32f_convert_16i_a.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'volk/include')

diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
index a24959678..c2a07398f 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -1,6 +1,10 @@
 #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
 #define INCLUDED_volk_32f_s32f_convert_16i_a_H
 
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
 #include <volk/volk_common.h>
 #include <inttypes.h>
 #include <stdio.h>
-- 
cgit