summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_a.h52
-rw-r--r--volk/include/volk/volk_32f_s32f_convert_16i_u.h52
2 files changed, 90 insertions, 14 deletions
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
index 0a2b4f0f2..10c921b08 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h
@@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
@@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const
number = eighthPoints * 8;
for(; number < num_points; number++){
- *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)(r);
}
}
#endif /* LV_HAVE_SSE2 */
@@ -61,8 +78,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
@@ -70,7 +94,8 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
ret = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
- ret = _mm_mul_ps(ret, vScalar);
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
@@ -81,7 +106,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const
number = quarterPoints * 4;
for(; number < num_points; number++){
- *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)(r);
}
}
#endif /* LV_HAVE_SSE */
@@ -98,9 +128,17 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+ r = *inputVectorPtr++ * scalar;
+ if(r < min_val)
+ r = min_val;
+ else if(r > max_val)
+ r = max_val;
+ *outputVectorPtr++ = (int16_t)(r);
}
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
index dec3f1611..f339a7d10 100644
--- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
- intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
@@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const
number = eighthPoints * 8;
for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number] * scalar);
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)(r);
}
}
#endif /* LV_HAVE_SSE2 */
@@ -62,8 +79,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
@@ -71,7 +95,8 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
ret = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
- ret = _mm_mul_ps(ret, vScalar);
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
@@ -82,7 +107,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const
number = quarterPoints * 4;
for(; number < num_points; number++){
- outputVector[number] = (int16_t)(inputVector[number] * scalar);
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)(r);
}
}
#endif /* LV_HAVE_SSE */
@@ -100,9 +130,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
for(number = 0; number < num_points; number++){
- *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+ r = *inputVectorPtr++ * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int16_t)(r);
}
}
#endif /* LV_HAVE_GENERIC */