diff options
author | Johnathan Corgan | 2012-03-01 16:37:58 -0800 |
---|---|---|
committer | Johnathan Corgan | 2012-03-01 16:37:58 -0800 |
commit | 0a6f1494b8e462c13c8c48c69680832967ebec72 (patch) | |
tree | 4266dba3cd7b8ef50791859761ec645010ed571e /volk/include | |
parent | 088ca3e5ff1ea0c28bbc2f674382137d8b84fe51 (diff) | |
parent | e8d644872837f4cbfc05851710531b2ac5259806 (diff) | |
download | gnuradio-0a6f1494b8e462c13c8c48c69680832967ebec72.tar.gz gnuradio-0a6f1494b8e462c13c8c48c69680832967ebec72.tar.bz2 gnuradio-0a6f1494b8e462c13c8c48c69680832967ebec72.zip |
Merge remote branch 'tom/safe_align'
Diffstat (limited to 'volk/include')
24 files changed, 1578 insertions, 68 deletions
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index e7333a015..a01ddf193 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,9 +53,14 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_x2_add_32f_u.h \ volk_32f_s32f_multiply_32f_a.h \ + volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ + volk_32fc_s32fc_multiply_32fc_u.h \ + volk_32fc_x2_multiply_conjugate_32fc_a.h \ + volk_32fc_x2_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ @@ -65,13 +70,18 @@ volkinclude_HEADERS = \ volk_32fc_deinterleave_64f_x2_a.h \ volk_32fc_s32f_deinterleave_real_16i_a.h \ volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_imag_32f_a.h \ volk_32fc_deinterleave_real_64f_a.h \ volk_32fc_x2_dot_prod_32fc_a.h \ volk_32fc_x2_dot_prod_32fc_u.h \ volk_32fc_index_max_16u_a.h \ volk_32fc_s32f_magnitude_16i_a.h \ volk_32fc_magnitude_32f_a.h \ + volk_32fc_magnitude_32f_u.h \ + volk_32fc_magnitude_squared_32f_a.h \ + volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ + volk_32fc_x2_multiply_32fc_u.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ volk_32f_s32f_convert_32i_a.h \ @@ -94,6 +104,7 @@ volkinclude_HEADERS = \ volk_32f_x2_max_32f_a.h \ volk_32f_x2_min_32f_a.h \ volk_32f_x2_multiply_32f_a.h \ + volk_32f_x2_multiply_32f_u.h \ volk_32f_s32f_normalize_a.h \ volk_32f_s32f_power_32f_a.h \ volk_32f_sqrt_32f_a.h \ @@ -123,4 +134,7 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ volk_8i_s32f_convert_32f_a.h \ - volk_8i_s32f_convert_32f_u.h + volk_8i_s32f_convert_32f_u.h \ + volk_32fc_conjugate_32fc_a.h \ + volk_32fc_conjugate_32fc_u.h + diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 0a2b4f0f2..a24959678 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -4,6 +4,7 @@ #include <volk/volk_common.h> #include <inttypes.h> #include <stdio.h> +#include <math.h> #ifdef LV_HAVE_SSE2 #include <emmintrin.h> @@ -21,17 +22,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +53,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -61,8 +79,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -70,18 +95,24 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -98,9 +129,17 @@ static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index dec3f1611..f58158041 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -3,6 +3,7 @@ #include <inttypes.h> #include <stdio.h> +#include <math.h> #ifdef LV_HAVE_SSE2 #include <emmintrin.h> @@ -21,17 +22,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +53,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -62,8 +80,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -71,18 +96,24 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -100,9 +131,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index aa370e614..8f2fc791e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -21,14 +21,22 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483648; + float max_val = 2147483647; + float r; + __m256 vScalar = _mm256_set1_ps(scalar); __m256 inputVal1; __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; @@ -36,7 +44,12 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_AVX */ @@ -57,14 +70,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483648; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -72,7 +93,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -93,8 +119,15 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -102,7 +135,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -113,7 +146,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -130,9 +168,17 @@ static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index b4e954dc4..d8493454b 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -21,14 +21,24 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + //float min_val = -2147483647; + //float max_val = 2147483647; + float min_val = -2146400000; + float max_val = 2146400000; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -36,7 +46,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -58,8 +73,15 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -67,7 +89,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -78,7 +100,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -96,9 +123,17 @@ static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 8d87a07d7..05172171c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -67,9 +84,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + int8_t* outputVectorPtr = outputVector; __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -77,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -88,7 +112,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -105,9 +134,17 @@ static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 1c6bf87c9..12991e9c1 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -69,8 +86,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -78,7 +102,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -89,7 +113,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -107,9 +136,17 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index 37223dc81..d1c6f3f65 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -4,6 +4,81 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC /*! \brief Scalar float multiply diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h new file mode 100644 index 000000000..0e700060f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h new file mode 100644 index 000000000..e360a7958 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h new file mode 100644 index 000000000..6c3ce5d83 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! + \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h new file mode 100644 index 000000000..1518af9be --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_store_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h new file mode 100644 index 000000000..b26fe0789 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h new file mode 100644 index 000000000..adc4112b9 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in q1q2q3q4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(qBufferPtr, iValue); + + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + for(number = 0; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h new file mode 100644 index 000000000..ed1cedef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h new file mode 100644 index 000000000..00bdefbb5 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h new file mode 100644 index 000000000..6eb4a523a --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..534dc2a25 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -6,7 +6,8 @@ #include <volk/volk_complex.h> #include <float.h> -#ifdef LV_HAVE_GENERIC +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -14,18 +15,44 @@ \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; +static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; } } -#endif /* LV_HAVE_GENERIC */ +#endif /* LV_HAVE_SSE */ -#ifdef LV_HAVE_ORC + +#ifdef LV_HAVE_GENERIC /*! \brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -33,11 +60,29 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; } -#endif /* LV_HAVE_ORC */ +#endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h new file mode 100644 index 000000000..218c450f8 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -0,0 +1,87 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 18dd092e8..aec8bd716 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -23,7 +23,6 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h new file mode 100644 index 000000000..729c1a4ad --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..2a1bcbce0 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h new file mode 100644 index 000000000..92f6a051e --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index bdaa98643..4683f1e38 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -10,10 +10,11 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { - const uint32_t* valueVector = (const uint32_t*)&value; + //const uint32_t* valueVector = (const uint32_t*)&value; // This is faster than a lookup table - uint32_t retVal = valueVector[0]; + //uint32_t retVal = valueVector[0]; + uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -22,7 +23,8 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value retVal = (retVal + (retVal >> 16)) & 0x0000003F; uint64_t retVal64 = retVal; - retVal = valueVector[1]; + //retVal = valueVector[1]; + retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; |