From d825bb2bc1c7bc4105723ae5f699f283bc4f3c44 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 17:53:49 -0500 Subject: volk: float_to_short now clips the values instead of wrapping around. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 52 +++++++++++++++++++++---- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 52 +++++++++++++++++++++---- 2 files changed, 90 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 0a2b4f0f2..10c921b08 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -61,8 +78,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -70,7 +94,8 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); @@ -81,7 +106,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -98,9 +128,17 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index dec3f1611..f339a7d10 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -21,17 +21,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +52,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -62,8 +79,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -71,7 +95,8 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); @@ -82,7 +107,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -100,9 +130,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From d8b02979cef097971bc0656b904f7b51d19b03c9 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 20:07:26 -0500 Subject: volk: fix a warning. --- volk/include/volk/volk_64u_popcnt_a.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index bdaa98643..4683f1e38 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -10,10 +10,11 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { - const uint32_t* valueVector = (const uint32_t*)&value; + //const uint32_t* valueVector = (const uint32_t*)&value; // This is faster than a lookup table - uint32_t retVal = valueVector[0]; + //uint32_t retVal = valueVector[0]; + uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -22,7 +23,8 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value retVal = (retVal + (retVal >> 16)) & 0x0000003F; uint64_t retVal64 = retVal; - retVal = valueVector[1]; + //retVal = valueVector[1]; + retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; -- cgit From 42d9560a50bbbab143b48bda5a73a5379818ddbe Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 26 Jan 2012 20:07:51 -0500 Subject: volk: float_to_int and float_to_char updated to clip instead of wrap around. The float to int clips at smaller than 2^32 because of the limits of the float representation. --- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 60 ++++++++++++++++++++++--- volk/include/volk/volk_32f_s32f_convert_32i_u.h | 45 ++++++++++++++++--- volk/include/volk/volk_32f_s32f_convert_8i_a.h | 53 ++++++++++++++++++---- volk/include/volk/volk_32f_s32f_convert_8i_u.h | 53 ++++++++++++++++++---- 4 files changed, 183 insertions(+), 28 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index aa370e614..15fa282fb 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -21,14 +21,22 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m256 vScalar = _mm256_set1_ps(scalar); __m256 inputVal1; __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; @@ -36,7 +44,12 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_AVX */ @@ -57,14 +70,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -72,7 +93,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -93,8 +119,15 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -102,7 +135,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -113,7 +146,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -130,9 +168,17 @@ static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index b4e954dc4..d8493454b 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -21,14 +21,24 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + //float min_val = -2147483647; + //float max_val = 2147483647; + float min_val = -2146400000; + float max_val = 2146400000; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -36,7 +46,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -58,8 +73,15 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -67,7 +89,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -78,7 +100,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -96,9 +123,17 @@ static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 8d87a07d7..05172171c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -67,9 +84,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + int8_t* outputVectorPtr = outputVector; __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -77,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -88,7 +112,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -105,9 +134,17 @@ static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 1c6bf87c9..12991e9c1 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -69,8 +86,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -78,7 +102,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -89,7 +113,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -107,9 +136,17 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From d870487437b33d1a26eb8f5f5aaa414ca6ed24e0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 28 Jan 2012 19:31:33 -0500 Subject: volk: fix lower bound of int conversion. --- volk/include/volk/volk_32f_s32f_convert_32i_a.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index 15fa282fb..8f2fc791e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -22,7 +22,7 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483647; + float min_val = -2147483648; float max_val = 2147483647; float r; @@ -71,7 +71,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; - float min_val = -2147483647; + float min_val = -2147483648; float max_val = 2147483647; float r; -- cgit From 9c9b9a8aa05c7cde7cd11cd5d1e052630715d252 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 29 Jan 2012 17:31:37 -0500 Subject: volk: added unaligned volk function for magnitude of a complex number. --- volk/include/volk/volk_32fc_magnitude_32f_u.h | 118 ++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h new file mode 100644 index 000000000..ed1cedef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ -- cgit From d142fd9e31715bae85d65c4790f278143a784142 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 29 Jan 2012 17:32:30 -0500 Subject: volk: added volk magnitiude squared functions (aligned/unaligned) for complex numbers. --- volk/include/volk/Makefile.am | 3 + .../volk/volk_32fc_magnitude_squared_32f_a.h | 114 +++++++++++++++++++++ .../volk/volk_32fc_magnitude_squared_32f_u.h | 114 +++++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_a.h create mode 100644 volk/include/volk/volk_32fc_magnitude_squared_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index e7333a015..c2502f6e6 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -71,6 +71,9 @@ volkinclude_HEADERS = \ volk_32fc_index_max_16u_a.h \ volk_32fc_s32f_magnitude_16i_a.h \ volk_32fc_magnitude_32f_a.h \ + volk_32fc_magnitude_32f_u.h \ + volk_32fc_magnitude_squared_32f_a.h \ + volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h new file mode 100644 index 000000000..00bdefbb5 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h new file mode 100644 index 000000000..6eb4a523a --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ -- cgit From cb458204245632ac7a571bfe96b90d3c55a2f01a Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 30 Jan 2012 00:22:44 -0500 Subject: volk: adding complex to imag kernel. --- volk/include/volk/Makefile.am | 1 + .../volk/volk_32fc_deinterleave_imag_32f_a.h | 68 ++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index c2502f6e6..c5b99c41b 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -65,6 +65,7 @@ volkinclude_HEADERS = \ volk_32fc_deinterleave_64f_x2_a.h \ volk_32fc_s32f_deinterleave_real_16i_a.h \ volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_imag_32f_a.h \ volk_32fc_deinterleave_real_64f_a.h \ volk_32fc_x2_dot_prod_32fc_a.h \ volk_32fc_x2_dot_prod_32fc_u.h \ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h new file mode 100644 index 000000000..adc4112b9 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in q1q2q3q4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(qBufferPtr, iValue); + + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + for(number = 0; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */ -- cgit From 070a6c9ce93c2f2043a0145e253cc55f801f9433 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 14:25:53 -0500 Subject: volk: adding unaligned versions of complex multiply a constant and complex multiply 2 streams. --- .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 42 ++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_u.h | 77 ++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h new file mode 100644 index 000000000..201dcf5f6 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -0,0 +1,42 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h new file mode 100644 index 000000000..729c1a4ad --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ -- cgit From 67d23bdecd3f15197bdf46f8c0cd66a6f754fea5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 16:35:14 -0500 Subject: volk: improving performance of multiply_const and multiply two streams. --- .../include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 63 +++++++++++++++++++++- .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 45 ++++++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a.h | 1 - 3 files changed, 106 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..205461afb 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -6,6 +6,52 @@ #include #include +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + + #ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector @@ -17,11 +63,24 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; + unsigned int number = num_points; - for(number = 0; number < num_points; number++){ + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 201dcf5f6..a9dfcda19 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -6,6 +6,51 @@ #include #include +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + #ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 18dd092e8..aec8bd716 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -23,7 +23,6 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi -- cgit From b3e538493f969526c704a3eb1b3c8d8c61be78f6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 16:38:36 -0500 Subject: volk: adding new functions to Makefile. --- volk/include/volk/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index c5b99c41b..eea006bf2 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -56,6 +56,7 @@ volkinclude_HEADERS = \ volk_32f_s32f_multiply_32f_a.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ + volk_32fc_s32fc_multiply_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ @@ -76,6 +77,7 @@ volkinclude_HEADERS = \ volk_32fc_magnitude_squared_32f_a.h \ volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ + volk_32fc_x2_multiply_32fc_u.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ volk_32f_s32f_convert_32i_a.h \ -- cgit From ae663decab658be25ac01072fa2f5c8454bd6167 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 2 Feb 2012 17:26:39 -0500 Subject: core: moving multiply_const_ff from gengen to general to take advantage of volk. Also adds SSE and AVX and unaligned Volk versions for this. --- volk/include/volk/volk_32f_s32f_multiply_32f_a.h | 75 ++++++++++++++++++++++ .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 12 ++-- 2 files changed, 81 insertions(+), 6 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index 37223dc81..d1c6f3f65 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -4,6 +4,81 @@ #include #include +#ifdef LV_HAVE_SSE +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC /*! \brief Scalar float multiply diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index a9dfcda19..450a89066 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -9,10 +9,10 @@ #ifdef LV_HAVE_SSE3 #include /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector + \brief Multiplies the input vector by a scalar and stores the results in the third vector \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ @@ -53,10 +53,10 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons #ifdef LV_HAVE_GENERIC /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector + \brief Multiplies the input vector by a scalar and stores the results in the third vector \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ -- cgit From f028a198cb47e0486ad41a29bd3b3dcf0663d766 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 4 Feb 2012 11:04:30 -0500 Subject: volk: new unaligned versions of float multipliers. --- volk/include/volk/Makefile.am | 2 + volk/include/volk/volk_32f_s32f_multiply_32f_u.h | 102 ++++++++++++++++++++ volk/include/volk/volk_32f_x2_multiply_32f_u.h | 106 +++++++++++++++++++++ .../include/volk/volk_32fc_s32fc_multiply_32fc_u.h | 24 ++--- 4 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 volk/include/volk/volk_32f_s32f_multiply_32f_u.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index eea006bf2..312ff2d5a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -54,6 +54,7 @@ volkinclude_HEADERS = \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ volk_32f_s32f_multiply_32f_a.h \ + volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ @@ -100,6 +101,7 @@ volkinclude_HEADERS = \ volk_32f_x2_max_32f_a.h \ volk_32f_x2_min_32f_a.h \ volk_32f_x2_multiply_32f_a.h \ + volk_32f_x2_multiply_32f_u.h \ volk_32f_s32f_normalize_a.h \ volk_32f_s32f_power_32f_a.h \ volk_32f_sqrt_32f_a.h \ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h new file mode 100644 index 000000000..0e700060f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h new file mode 100644 index 000000000..6c3ce5d83 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h index 450a89066..218c450f8 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -8,13 +8,13 @@ #ifdef LV_HAVE_SSE3 #include - /*! +/*! \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ unsigned int number = 0; const unsigned int halfPoints = num_points / 2; @@ -52,13 +52,13 @@ static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, cons #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_GENERIC - /*! +/*! \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ lv_32fc_t* cPtr = cVector; const lv_32fc_t* aPtr = aVector; -- cgit From 298623615b249de459cd12b5507dab921fc3210b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 6 Feb 2012 10:31:28 -0500 Subject: volk: added unaligned version of adding 2 vectors. --- volk/include/volk/Makefile.am | 1 + volk/include/volk/volk_32f_x2_add_32f_u.h | 66 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 volk/include/volk/volk_32f_x2_add_32f_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 312ff2d5a..20864efbe 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,6 +53,7 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_x2_add_32f_u.h \ volk_32f_s32f_multiply_32f_a.h \ volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h new file mode 100644 index 000000000..e360a7958 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ -- cgit From 69210086c7ae98b93a63a1d810ee28b304a13520 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Mon, 6 Feb 2012 22:02:09 -0500 Subject: volk: added a 32fc multiply conjugate kernel. --- volk/include/volk/Makefile.am | 2 + .../volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 82 ++++++++++++++++++++++ .../volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 81 +++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 20864efbe..d071f18f2 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -59,6 +59,8 @@ volkinclude_HEADERS = \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ + volk_32fc_s32fc_multiply_conjugate_32fc_a.h \ + volk_32fc_s32fc_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..70476a8c7 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(1, -1, 1, -1); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + // FIXME: replace with xor for a faster implementation + y = _mm_mul_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h new file mode 100644 index 000000000..fbaa29c17 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ -- cgit From cdb328758dca9fa494956c0e62f5e78adf613982 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Feb 2012 16:59:50 -0500 Subject: volk: fixed complex multiply and conjugate kernel to use xor for conjugation. --- volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h | 5 ++--- volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h index 70476a8c7..2a1bcbce0 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -24,15 +24,14 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVecto const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - __m128 conjugator = _mm_setr_ps(1, -1, 1, -1); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - // FIXME: replace with xor for a faster implementation - y = _mm_mul_ps(y, conjugator); // conjugate y + y = _mm_xor_ps(y, conjugator); // conjugate y yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h index fbaa29c17..92f6a051e 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -24,7 +24,7 @@ static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVecto const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - __m128 conjugator = _mm_set_ps(0, 0x80000000, 0, 0x80000000); + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); for(;number < halfPoints; number++){ -- cgit From 3080cd75a6a10aab757e1e02fb99e81e2f3724d5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Feb 2012 17:23:44 -0500 Subject: volk: adding complex conjugate kernel. --- volk/include/volk/Makefile.am | 5 +- volk/include/volk/volk_32fc_conjugate_32fc_a.h | 64 ++++++++++++++++++++++++++ volk/include/volk/volk_32fc_conjugate_32fc_u.h | 64 ++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_a.h create mode 100644 volk/include/volk/volk_32fc_conjugate_32fc_u.h (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index d071f18f2..f6b5835b1 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -134,4 +134,7 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ volk_8i_s32f_convert_32f_a.h \ - volk_8i_s32f_convert_32f_u.h + volk_8i_s32f_convert_32f_u.h \ + volk_32fc_conjugate_32fc_a.h \ + volk_32fc_conjugate_32fc_u.h + diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h new file mode 100644 index 000000000..1518af9be --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_conjugate_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_store_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h new file mode 100644 index 000000000..b26fe0789 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ -- cgit From 2eaa0a6e1e57cfc374c258c317ecb469fc49bf53 Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Tue, 14 Feb 2012 15:37:57 -0800 Subject: build: fix autotools for gnuradio-core volkification --- volk/include/volk/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index f6b5835b1..a01ddf193 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -59,8 +59,8 @@ volkinclude_HEADERS = \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_u.h \ - volk_32fc_s32fc_multiply_conjugate_32fc_a.h \ - volk_32fc_s32fc_multiply_conjugate_32fc_u.h \ + volk_32fc_x2_multiply_conjugate_32fc_a.h \ + volk_32fc_x2_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ -- cgit From fa8ab7cb146287a9f0d8db67e3126ab4a867a201 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 21 Feb 2012 15:41:27 -0800 Subject: Volk: add scalar const support to the profiler/QA code. Disabled volk_32fc_s32fc_multiply_32fc_a's Orc impl due to it not working. --- volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..75cf8c8b2 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -34,9 +34,9 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); -} +//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ +// volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +//} #endif /* LV_HAVE_ORC */ -- cgit From 330cddf31208b843c0997c6fb05cb3facf31f536 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 22 Feb 2012 08:46:55 -0800 Subject: Remove ORC invocation since // doesn't dissuade the generator. --- volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index 75cf8c8b2..665fad47a 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -25,20 +25,6 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c } #endif /* LV_HAVE_GENERIC */ -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -//static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ -// volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); -//} -#endif /* LV_HAVE_ORC */ - -- cgit From e8d644872837f4cbfc05851710531b2ac5259806 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 23 Feb 2012 14:28:21 -0500 Subject: volk: float to short conversion is consistent between archs and tail cases. Rounds to nearest number. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 15 ++++++++------- volk/include/volk/volk_32f_s32f_convert_16i_u.h | 15 ++++++++------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 10c921b08..a24959678 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef LV_HAVE_SSE2 #include @@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -98,10 +99,10 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; @@ -111,7 +112,7 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -138,7 +139,7 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co r = min_val; else if(r > max_val) r = max_val; - *outputVectorPtr++ = (int16_t)(r); + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index f339a7d10..f58158041 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -3,6 +3,7 @@ #include #include +#include #ifdef LV_HAVE_SSE2 #include @@ -57,7 +58,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -99,10 +100,10 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; @@ -112,7 +113,7 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const r = max_val; else if(r < min_val) r = min_val; - outputVector[number] = (int16_t)(r); + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -140,7 +141,7 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co r = max_val; else if(r < min_val) r = min_val; - *outputVectorPtr++ = (int16_t)(r); + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ -- cgit From 4f0add175f8e44922c15ed82d644597eceb65fb6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 3 Mar 2012 10:11:28 -0500 Subject: volk: include config.h to have rintf in windows/msvc. --- volk/include/volk/volk_32f_s32f_convert_16i_a.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'volk/include') diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index a24959678..c2a07398f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,6 +1,10 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H +#ifdef HAVE_CONFIG_H +#include +#endif + #include #include #include -- cgit