diff options
Diffstat (limited to 'volk')
19 files changed, 356 insertions, 110 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc index b9ac9ecc2..10a699872 100644 --- a/volk/apps/volk_profile.cc +++ b/volk/apps/volk_profile.cc @@ -27,8 +27,8 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000, &results); VOLK_PROFILE(volk_16i_convert_8i_a, 0, 0, 204600, 10000, &results); VOLK_PROFILE(volk_16i_convert_8i_u, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_16i_a, 0, 0, 204600, 10000, &results); + //VOLK_PROFILE(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000, &results); //VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results); //VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results); VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results); @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results); - VOLK_PROFILE(volk_32fc_index_max_16u_a, 0, 0, 204600, 10000, &results); + VOLK_PROFILE(volk_32fc_index_max_16u_a, 3, 0, 204600, 10000, &results); VOLK_PROFILE(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100, &results); VOLK_PROFILE(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000, &results); VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000, &results); //VOLK_PROFILE(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000, &results); - VOLK_PROFILE(volk_32f_index_max_16u_a, 0, 0, 204600, 5000, &results); + VOLK_PROFILE(volk_32f_index_max_16u_a, 3, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000, &results); VOLK_PROFILE(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000, &results); VOLK_PROFILE(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000, &results); @@ -102,6 +102,8 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_8i_convert_16i_u, 0, 0, 204600, 2000, &results); VOLK_PROFILE(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000, &results); VOLK_PROFILE(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000, &results); + VOLK_PROFILE(volk_32fc_s32fc_multiply_32fc_a, 1e-4, 0, 204600, 1000, &results); + VOLK_PROFILE(volk_32f_s32f_multiply_32f_a, 1e-4, 0, 204600, 1000, &results); char path[256]; get_config_path(path); diff --git a/volk/gen/compilers.xml b/volk/gen/compilers.xml index 70c82e555..005eda2aa 100644 --- a/volk/gen/compilers.xml +++ b/volk/gen/compilers.xml @@ -2,9 +2,21 @@ <grammar> <compiler name="MSVC"> + <!-- remap the following flags to SSE --> <remap name="mmmx">arch:SSE</remap> <remap name="msse">arch:SSE</remap> + + <!-- remap the following flags to SSE2 --> <remap name="msse2">arch:SSE2</remap> + + <!-- remap the following flags to AVX --> + <remap name="msse3">arch:AVX</remap> + <remap name="mssse3">arch:AVX</remap> + <remap name="msse4.1">arch:AVX</remap> + <remap name="msse4.2">arch:AVX</remap> + <remap name="mpopcnt">arch:AVX</remap> + <remap name="mavx">arch:AVX</remap> + <prefix>/</prefix> </compiler> @@ -15,4 +27,4 @@ -</grammar>
\ No newline at end of file +</grammar> diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml index b872b9fb1..9c19c91c6 100644 --- a/volk/gen/machines.xml +++ b/volk/gen/machines.xml @@ -15,9 +15,9 @@ --> <!-- -Create an SSE2 only machine (without 64/32 inline assembly support). +Create an SSE2 and AVX only machine (without 64/32 inline assembly support). This machine is intended to support the MSVC compiler on x86/amd64. -The MSVC compiler has intrinsic support for SSE and SSE2, +The MSVC compiler has intrinsic support for SSE, SSE2, AVX however it does not support the gcc style inline assembly. --> @@ -57,6 +57,10 @@ however it does not support the gcc style inline assembly. <archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx</archs> </machine> +<machine name="avx_only"> +<archs>generic mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx</archs> +</machine> + <machine name="altivec"> <archs>generic altivec</archs> </machine> diff --git a/volk/gen/make_cpuid_c.py b/volk/gen/make_cpuid_c.py index 7281f45a3..2be1123a8 100644 --- a/volk/gen/make_cpuid_c.py +++ b/volk/gen/make_cpuid_c.py @@ -39,7 +39,7 @@ struct VOLK_CPU volk_cpu; //implement get cpuid for gcc compilers using a copy of cpuid.h #if defined(__GNUC__) #include <gcc_x86_cpuid.h> -#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3) +#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3) //implement get cpuid for MSVC compilers using __cpuid intrinsic #elif defined(_MSC_VER) diff --git a/volk/gen/volk_regexp.py b/volk/gen/volk_regexp.py index b83ce5206..eb4ceb54b 100644 --- a/volk/gen/volk_regexp.py +++ b/volk/gen/volk_regexp.py @@ -1,5 +1,4 @@ import re -import string remove_after_underscore = re.compile("_.*"); space_remove = re.compile(" "); @@ -10,5 +9,5 @@ replace_volk = re.compile("volk"); def strip_trailing(tostrip, stripstr): lindex = tostrip.rfind(stripstr) - tostrip = tostrip[0:lindex] + string.replace(tostrip[lindex:len(tostrip)], stripstr, ""); + tostrip = tostrip[0:lindex] + tostrip[lindex:len(tostrip)].replace(stripstr, ""); return tostrip diff --git a/volk/gen/volk_register.py b/volk/gen/volk_register.py index cd874e470..0774ece29 100644 --- a/volk/gen/volk_register.py +++ b/volk/gen/volk_register.py @@ -4,7 +4,6 @@ import sys import os import re import glob -import string from xml.dom import minidom from volk_regexp import * from make_cpuid_c import make_cpuid_c @@ -101,7 +100,7 @@ for filearch in filearchs: archs_or = "(" for arch in archs: - archs_or = archs_or + string.upper(arch) + "|"; + archs_or = archs_or + arch.upper() + "|"; archs_or = archs_or[0:len(archs_or)-1]; archs_or = archs_or + ")"; diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h index f60b33a41..a10a62350 100644 --- a/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a.h @@ -1,6 +1,7 @@ #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H +#include <volk/volk_common.h> #include<inttypes.h> #include<stdio.h> @@ -21,7 +22,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; __m128i xmm5, xmm6, xmm7, xmm8; xmm4 = _mm_load_si128((__m128i*)shufmask0); @@ -92,8 +93,7 @@ static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, in xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); + _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec); p_target = (__m128i*)((int8_t*)p_target + 8); diff --git a/volk/include/volk/volk_32f_s32f_multiply_32f_a.h b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h new file mode 100644 index 000000000..37223dc81 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_multiply_32f_a.h @@ -0,0 +1,44 @@ +#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H +#define INCLUDED_volk_32f_s32f_multiply_32f_a_H + +#include <inttypes.h> +#include <stdio.h> + +#ifdef LV_HAVE_GENERIC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* inputPtr = aVector; + float* outputPtr = cVector; + for(number = 0; number < num_points; number++){ + *outputPtr = (*inputPtr) * scalar; + inputPtr++; + outputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Scalar float multiply + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param scalar the scalar value + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst, const float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_multiply_32f_a_orc(float* cVector, const float* aVector, const float scalar, unsigned int num_points){ + volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a.h b/volk/include/volk/volk_32fc_index_max_16u_a.h index 9566aa32e..125a34582 100644 --- a/volk/include/volk/volk_32fc_index_max_16u_a.h +++ b/volk/include/volk/volk_32fc_index_max_16u_a.h @@ -87,8 +87,8 @@ static inline void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_ xmm2 = _mm_load_ps((float*)src0); - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; xmm2 = _mm_mul_ps(xmm2, xmm2); diff --git a/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h new file mode 100644 index 000000000..b27a7259f --- /dev/null +++ b/volk/include/volk/volk_32fc_s32fc_multiply_32fc_a.h @@ -0,0 +1,46 @@ +#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H + +#include <inttypes.h> +#include <stdio.h> +#include <volk/volk_complex.h> +#include <float.h> + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_s32fc_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points); +static inline void volk_32fc_s32fc_multiply_32fc_a_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + volk_32fc_s32fc_multiply_32fc_a_orc_impl(cVector, aVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h index f11c93682..02faf86c2 100644 --- a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -96,9 +96,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result in1 = _mm_loadu_ps( (float*) (input+offset) ); in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = in1*in2; + Rv = _mm_mul_ps(in1, in2); fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = in1*fehg; + Iv = _mm_mul_ps(in1, fehg); Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); Ivm = _mm_xor_ps( negMask.vec, Iv ); Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); @@ -119,9 +119,9 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result in1 = _mm_loadu_ps( (float*) (input+offset) ); in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(in1*in2, halfMask.vec); + Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec); fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec); Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); Ivm = _mm_xor_ps( negMask.vec, Iv ); Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h new file mode 100644 index 000000000..7c0dba7fd --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_u.h @@ -0,0 +1,116 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H + +#include <volk/volk_common.h> +#include <volk/volk_complex.h> +#include <stdio.h> +#include <string.h> + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[num_points - 1] * taps[num_points - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE3 + +#include <pmmintrin.h> + +static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_points/2; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(num_points % 1 != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h index 0bb76f1d1..0c280eb6e 100644 --- a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a.h @@ -26,8 +26,8 @@ static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVect for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h index 3e05608a4..a2c2b04f6 100644 --- a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a.h @@ -29,8 +29,8 @@ static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* for(;number < quarterPoints; number++){ // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a)); + y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b)); // Calculate the ar*cr - ai*(-ci) portions realz = _mm_madd_epi16(x,y); diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h index 2c935d1fb..38263d5f7 100644 --- a/volk/include/volk/volk_common.h +++ b/volk/include/volk/volk_common.h @@ -91,4 +91,6 @@ union bit128{ #endif }; +#define bit128_p(x) ((union bit128 *)(x)) + #endif /*INCLUDED_LIBVOLK_COMMON_H*/ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 62e62c2f4..fbd4bdea5 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -4,90 +4,89 @@ //VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000); //VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 100); -VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1000); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 10000); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 10000); -VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1); +//VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000); //VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); //VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); -VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 50); -VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 100); +VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 1); //VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 100); -VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 1); +VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 1); //VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 100); -VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 2000); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 1); +VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1); //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 0, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 3000); -VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 2000); -VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 2000); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 100); -VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 100); -VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 3000); -VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 3000); -VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 5000); -VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 10000); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 10000); -VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 2000); +VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 1); +VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1000); -VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1); +VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1); //VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 3000); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 3000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 3000); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 3000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 10000); -VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 400); -VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 400); -VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 20000); -VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 2000); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 2000); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 2000); - +VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 1); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 1); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 1); diff --git a/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc new file mode 100644 index 000000000..ea23fc045 --- /dev/null +++ b/volk/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_s32f_multiply_32f_a_orc_impl +.dest 4 dst +.source 4 src1 +.floatparam 4 scalar +mulf dst, src1, scalar diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc index 505e73f5d..d3bf78935 100644 --- a/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc +++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a_orc_impl.orc @@ -9,15 +9,15 @@ .temp 4 sumf .temp 4 rootf .temp 4 rootl -.temp 4 maskl +#.temp 4 maskl x2 mulf prodiqf, src, src splitql qf, if, prodiqf addf sumf, if, qf sqrtf rootf, sumf mulf rootf, rootf, scalar -cmpltf maskl, 32768.0, rootf -andl maskl, maskl, 0x80000000 -orl rootf, rootf, maskl +#cmpltf maskl, 32768.0, rootf +#andl maskl, maskl, 0x80000000 +#orl rootf, rootf, maskl convfl rootl, rootf -convssslw dst, rootl +convsuslw dst, rootl diff --git a/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc new file mode 100644 index 000000000..2577e034f --- /dev/null +++ b/volk/orc/volk_32fc_s32fc_multiply_32fc_a_orc_impl.orc @@ -0,0 +1,18 @@ +.function volk_32fc_s32fc_multiply_32fc_a_orc_impl +.source 8 src1 +.floatparam 8 scalar +.dest 8 dst +.temp 8 iqprod +.temp 4 real +.temp 4 imag +.temp 4 ac +.temp 4 bd +.temp 8 swapped +x2 mulf iqprod, src1, scalar +splitql bd, ac, iqprod +subf real, ac, bd +swaplq swapped, src1 +x2 mulf iqprod, swapped, scalar +splitql bd, ac, iqprod +addf imag, ac, bd +mergelq dst, real, imag |