diff options
Diffstat (limited to 'volk')
29 files changed, 1650 insertions, 77 deletions
diff --git a/volk/apps/CMakeLists.txt b/volk/apps/CMakeLists.txt index f27bdc126..14291e5e3 100644 --- a/volk/apps/CMakeLists.txt +++ b/volk/apps/CMakeLists.txt @@ -42,4 +42,11 @@ add_executable(volk_profile target_link_libraries(volk_profile volk ${Boost_LIBRARIES}) +install( + PROGRAMS + ${CMAKE_BINARY_DIR}/apps/volk_profile + DESTINATION ${GR_RUNTIME_DIR} + COMPONENT "volk" +) + endif(Boost_FOUND AND UNIX) diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index 10a699872..bd36d6dc7 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -34,6 +34,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_add_32f_u, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50, &results); VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000, &results); @@ -43,13 +44,22 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_deinterleave_imag_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results); VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 
204600, 1000, &results); + VOLK_PROFILE(volk_32fc_magnitude_32f_u, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_magnitude_squared_32f_a, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_magnitude_squared_32f_u, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_conjugate_32fc_a, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_conjugate_32fc_u, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000, &results); VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results); VOLK_PROFILE(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000, &results); @@ -72,6 +82,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results); VOLK_PROFILE(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000, &results); VOLK_PROFILE(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_x2_multiply_32f_u, 1e-4, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000, &results); VOLK_PROFILE(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100, &results); VOLK_PROFILE(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100, &results); @@ -102,8 +113,11 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results); VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results); VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results); - VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); - VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 204600, 1000, &results); + 
//VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, lv_32fc_t(1.0, 0.5), 204600, 1000, &results); + VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 1.0, 204600, 10000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 204600, 1000, &results); + char path[256]; get_config_path(path); diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index e7333a015..a01ddf193 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -53,9 +53,14 @@ volkinclude_HEADERS = \ volk_16u_byteswap_a.h \ volk_32f_accumulator_s32f_a.h \ volk_32f_x2_add_32f_a.h \ + volk_32f_x2_add_32f_u.h \ volk_32f_s32f_multiply_32f_a.h \ + volk_32f_s32f_multiply_32f_u.h \ volk_32fc_32f_multiply_32fc_a.h \ volk_32fc_s32fc_multiply_32fc_a.h \ + volk_32fc_s32fc_multiply_32fc_u.h \ + volk_32fc_x2_multiply_conjugate_32fc_a.h \ + volk_32fc_x2_multiply_conjugate_32fc_u.h \ volk_32fc_s32f_power_32fc_a.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a.h \ volk_32fc_s32f_atan2_32f_a.h \ @@ -65,13 +70,18 @@ volkinclude_HEADERS = \ volk_32fc_deinterleave_64f_x2_a.h \ volk_32fc_s32f_deinterleave_real_16i_a.h \ volk_32fc_deinterleave_real_32f_a.h \ + volk_32fc_deinterleave_imag_32f_a.h \ volk_32fc_deinterleave_real_64f_a.h \ volk_32fc_x2_dot_prod_32fc_a.h \ volk_32fc_x2_dot_prod_32fc_u.h \ volk_32fc_index_max_16u_a.h \ volk_32fc_s32f_magnitude_16i_a.h \ volk_32fc_magnitude_32f_a.h \ + volk_32fc_magnitude_32f_u.h \ + volk_32fc_magnitude_squared_32f_a.h \ + volk_32fc_magnitude_squared_32f_u.h \ volk_32fc_x2_multiply_32fc_a.h \ + volk_32fc_x2_multiply_32fc_u.h \ volk_32f_s32f_convert_16i_a.h \ volk_32f_s32f_convert_16i_u.h \ volk_32f_s32f_convert_32i_a.h \ @@ -94,6 +104,7 @@ volkinclude_HEADERS = \ volk_32f_x2_max_32f_a.h \ volk_32f_x2_min_32f_a.h \ volk_32f_x2_multiply_32f_a.h \ + volk_32f_x2_multiply_32f_u.h \ volk_32f_s32f_normalize_a.h \ volk_32f_s32f_power_32f_a.h \ 
volk_32f_sqrt_32f_a.h \ @@ -123,4 +134,7 @@ volkinclude_HEADERS = \ volk_8i_convert_16i_a.h \ volk_8i_convert_16i_u.h \ volk_8i_s32f_convert_32f_a.h \ - volk_8i_s32f_convert_32f_u.h + volk_8i_s32f_convert_32f_u.h \ + volk_32fc_conjugate_32fc_a.h \ + volk_32fc_conjugate_32fc_u.h + diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a.h b/volk/include/volk/volk_32f_s32f_convert_16i_a.h index 0a2b4f0f2..c2a07398f 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a.h @@ -1,9 +1,14 @@ #ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H #define INCLUDED_volk_32f_s32f_convert_16i_a_H +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + #include <volk/volk_common.h> #include <inttypes.h> #include <stdio.h> +#include <math.h> #ifdef LV_HAVE_SSE2 #include <emmintrin.h> @@ -21,17 +26,29 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); @@ 
-40,7 +57,12 @@ static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -61,8 +83,15 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -70,18 +99,24 @@ static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -98,9 +133,17 @@ static inline void 
volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h index dec3f1611..f58158041 100644 --- a/volk/include/volk/volk_32f_s32f_convert_16i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -3,6 +3,7 @@ #include <inttypes.h> #include <stdio.h> +#include <math.h> #ifdef LV_HAVE_SSE2 #include <emmintrin.h> @@ -21,17 +22,29 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal1 = _mm_packs_epi32(intInputVal1, 
intInputVal2); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); @@ -40,7 +53,12 @@ static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); } } #endif /* LV_HAVE_SSE2 */ @@ -62,8 +80,15 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -71,18 +96,24 @@ static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); } number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = 
(int16_t)rintf(r); } } #endif /* LV_HAVE_SSE */ @@ -100,9 +131,17 @@ static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, co int16_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)rintf(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a.h b/volk/include/volk/volk_32f_s32f_convert_32i_a.h index aa370e614..8f2fc791e 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a.h @@ -21,14 +21,22 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483648; + float max_val = 2147483647; + float r; + __m256 vScalar = _mm256_set1_ps(scalar); __m256 inputVal1; __m256i intInputVal1; + __m256 vmin_val = _mm256_set1_ps(min_val); + __m256 vmax_val = _mm256_set1_ps(max_val); for(;number < eighthPoints; number++){ inputVal1 = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8; - intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm256_cvtps_epi32(inputVal1); _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; @@ -36,7 +44,12 @@ static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector, const number = eighthPoints * 8; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > 
max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_AVX */ @@ -57,14 +70,22 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483648; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -72,7 +93,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -93,8 +119,15 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -102,7 +135,7 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, 
vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -113,7 +146,12 @@ static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -130,9 +168,17 @@ static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h index b4e954dc4..d8493454b 100644 --- a/volk/include/volk/volk_32f_s32f_convert_32i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -21,14 +21,24 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + //float min_val = -2147483647; + //float max_val = 2147483647; + float min_val = -2146400000; + float max_val = 2146400000; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1; __m128i intInputVal1; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < quarterPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); 
inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + intInputVal1 = _mm_cvtps_epi32(inputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 4; @@ -36,7 +46,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -58,8 +73,15 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const const float* inputVectorPtr = (const float*)inputVector; int32_t* outputVectorPtr = outputVector; + + float min_val = -2147483647; + float max_val = 2147483647; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -67,7 +89,7 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); @@ -78,7 +100,12 @@ static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int32_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -96,9 +123,17 @@ static inline void 
volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, co int32_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -2147483647; + float max_val = 2147483647; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int32_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a.h b/volk/include/volk/volk_32f_s32f_convert_8i_a.h index 8d87a07d7..05172171c 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = 
_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -67,9 +84,16 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl const unsigned int quarterPoints = num_points / 4; const float* inputVectorPtr = (const float*)inputVector; + + float min_val = -128; + float max_val = 127; + float r; + int8_t* outputVectorPtr = outputVector; __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -77,7 +101,7 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl ret = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); @@ -88,7 +112,12 @@ static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - 
outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int8_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -105,9 +134,17 @@ static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int8_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h index 1c6bf87c9..12991e9c1 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_u.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -21,9 +21,16 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); for(;number < sixteenthPoints; number++){ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; @@ -31,10 +38,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - 
intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + inputVal1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + inputVal2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + inputVal3 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val); + inputVal4 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(inputVal1); + intInputVal2 = _mm_cvtps_epi32(inputVal2); + intInputVal3 = _mm_cvtps_epi32(inputVal3); + intInputVal4 = _mm_cvtps_epi32(inputVal4); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); @@ -47,7 +59,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const f number = sixteenthPoints * 16; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE2 */ @@ -69,8 +86,15 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl const float* inputVectorPtr = (const float*)inputVector; int8_t* outputVectorPtr = outputVector; + + float min_val = -128; + float max_val = 127; + float r; + __m128 vScalar = _mm_set_ps1(scalar); __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; @@ -78,7 +102,7 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl ret = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - ret = _mm_mul_ps(ret, vScalar); + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); _mm_store_ps(outputFloatBuffer, ret); *outputVectorPtr++ = 
(int8_t)(outputFloatBuffer[0]); @@ -89,7 +113,12 @@ static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const fl number = quarterPoints * 4; for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)(r); } } #endif /* LV_HAVE_SSE */ @@ -107,9 +136,17 @@ static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, cons int8_t* outputVectorPtr = outputVector; const float* inputVectorPtr = inputVector; unsigned int number = 0; + float min_val = -128; + float max_val = 127; + float r; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)(r); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h index 37223dc81..d1c6f3f65 100644 --- a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -4,6 +4,81 @@ #include <inttypes.h> #include <stdio.h> +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_load_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC /*! \brief Scalar float multiply diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_u.h b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h new file mode 100644 index 000000000..0e700060f --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_u.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H +#define INCLUDED_volk_32f_s32f_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, bVal, cVal; + bVal = _mm_set_ps1(scalar); + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m256 aVal, bVal, cVal; + bVal = _mm256_set1_ps(scalar); + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector to be multiplied by scalar and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_u_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_u.h b/volk/include/volk/volk_32f_x2_add_32f_u.h new file mode 100644 index 000000000..e360a7958 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_u.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_u_H +#define INCLUDED_volk_32f_x2_add_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_u.h b/volk/include/volk/volk_32f_x2_multiply_32f_u.h new file mode 100644 index 000000000..6c3ce5d83 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_u.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H +#define INCLUDED_volk_32f_x2_multiply_32f_u_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Multiplies the two input vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_AVX +#include <immintrin.h> +/*! 
+ \brief Multiplies the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m256 aVal, bVal, cVal; + for(;number < eighthPoints; number++){ + + aVal = _mm256_loadu_ps(aPtr); + bVal = _mm256_loadu_ps(bPtr); + + cVal = _mm256_mul_ps(aVal, bVal); + + _mm256_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 8; + bPtr += 8; + cPtr += 8; + } + + number = eighthPoints * 8; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_u_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_conjugate_32fc_a.h new file mode 100644 index 000000000..1518af9be --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_a.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_store_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_conjugate_32fc_u.h new file mode 100644 index 000000000..b26fe0789 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_32fc_u.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> 
+#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_32fc_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h new file mode 100644 index 000000000..adc4112b9 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_imag_32f_a.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H +#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> +/*! 
+ \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* qBufferPtr = qBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in q1q2q3q4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(qBufferPtr, iValue); + + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Deinterleaves the complex vector into Q vector data + \param complexVector The complex input vector + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_imag_32f_a_generic(float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* qBufferPtr = qBuffer; + for(number = 0; number < num_points; number++){ + complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_u.h b/volk/include/volk/volk_32fc_magnitude_32f_u.h new file mode 100644 index 000000000..ed1cedef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_u.h @@ -0,0 +1,118 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H +#define INCLUDED_volk_32fc_magnitude_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h new file mode 100644 index 000000000..00bdefbb5 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_a.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h new file mode 100644 index 000000000..6eb4a523a --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_squared_32f_u.h @@ -0,0 +1,114 @@ +#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <math.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include <xmmintrin.h> + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_squared_32f_u_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h index b27a7259f..534dc2a25 100644 --- a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -6,7 +6,8 @@ #include <volk/volk_complex.h> #include <float.h> -#ifdef LV_HAVE_GENERIC +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> /*! 
\brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -14,18 +15,44 @@ \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; +static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * scalar; + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; } } -#endif /* LV_HAVE_GENERIC */ +#endif /* LV_HAVE_SSE3 */ -#ifdef LV_HAVE_ORC + +#ifdef LV_HAVE_GENERIC /*! 
\brief Multiplies the two input complex vectors and stores their results in the third vector \param cVector The vector where the results will be stored @@ -33,11 +60,29 @@ static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, c \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); -static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; } -#endif /* LV_HAVE_ORC */ +#endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h new file mode 100644 index 000000000..218c450f8 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_u.h @@ -0,0 +1,87 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef 
LV_HAVE_SSE3 +#include <pmmintrin.h> +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32fc_s32fc_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h index 18dd092e8..aec8bd716 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a.h @@ -23,7 +23,6 @@ static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const l lv_32fc_t* c = cVector; const lv_32fc_t* a = aVector; const lv_32fc_t* b = bVector; - for(;number < halfPoints; number++){ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h new file mode 100644 index 000000000..729c1a4ad --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_u.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H +#define 
INCLUDED_volk_32fc_x2_multiply_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h new file mode 100644 index 000000000..2a1bcbce0 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_a.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h new file mode 100644 index 000000000..92f6a051e --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_conjugate_32fc_u.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H +#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_SSE3 +#include <pmmintrin.h> + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + y = _mm_xor_ps(y, conjugator); // conjugate y + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * lv_conj(*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies vector a by the conjugate of vector b and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector First vector to be multiplied + \param bVector Second vector that is conjugated before being multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_conjugate_32fc_u_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * lv_conj(*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a.h b/volk/include/volk/volk_64u_popcnt_a.h index bdaa98643..4683f1e38 100644 --- a/volk/include/volk/volk_64u_popcnt_a.h +++ b/volk/include/volk/volk_64u_popcnt_a.h @@ -10,10 +10,11 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value) { - const uint32_t* valueVector = (const uint32_t*)&value; + //const uint32_t* valueVector = (const uint32_t*)&value; // This is faster than a lookup table - uint32_t retVal = valueVector[0]; + //uint32_t retVal = valueVector[0]; + uint32_t retVal = (uint32_t)(value && 0x00000000FFFFFFFF); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); @@ -22,7 +23,8 @@ static inline void volk_64u_popcnt_a_generic(uint64_t* ret, const uint64_t value retVal = (retVal + (retVal >> 16)) & 0x0000003F; uint64_t retVal64 = retVal; - retVal = valueVector[1]; + //retVal = valueVector[1]; + retVal = (uint32_t)((value && 0xFFFFFFFF00000000) >> 31); retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); retVal = (retVal & 
0x33333333) + (retVal >> 2 & 0x33333333); retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 9bb515e9f..bb37801c9 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -198,6 +198,18 @@ inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector<void *> &buf while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } +inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + template <class t> bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { bool fail = false; @@ -246,7 +258,7 @@ bool run_volk_tests(struct volk_func_desc desc, void (*manual_func)(), std::string name, float tol, - float scalar, + lv_32fc_t scalar, int vlen, int iter, std::vector<std::string> *best_arch_vector = 0 @@ -316,21 +328,33 @@ bool run_volk_tests(struct volk_func_desc desc, if(inputsc.size() == 0) { run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + if(inputsc[0].is_complex) { + run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), 
test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } else throw "unsupported 1 arg function >1 scalars"; break; case 2: if(inputsc.size() == 0) { run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + if(inputsc[0].is_complex) { + run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } else throw "unsupported 2 arg function >1 scalars"; break; case 3: if(inputsc.size() == 0) { run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + if(inputsc[0].is_complex) { + run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } else throw "unsupported 3 arg function >1 scalars"; break; case 4: diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index a1bc1f20c..b998df852 100644 --- a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -21,7 +21,7 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, float, int, int, std::vector<std::string> *); +bool run_volk_tests(struct volk_func_desc, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector<std::string> *); #define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { 
BOOST_CHECK_EQUAL(run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0), 0); } #define VOLK_PROFILE(func, tol, scalar, len, iter, results) run_volk_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results) @@ -32,5 +32,8 @@ typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); #endif //VOLK_QA_UTILS_H diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index fbd4bdea5..593087f85 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -22,6 +22,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f_u, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1); @@ -37,7 +38,6 @@ VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); 
-VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1); @@ -59,7 +59,6 @@ VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1); VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1); VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1); @@ -90,3 +89,15 @@ VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1); VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_conjugate_32fc_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_conjugate_32fc_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32fc_multiply_32fc_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_multiply_32f_u, 1e-4, 0, 20460, 1); |