diff options
-rw-r--r-- | volk/gen/archs.xml | 2 | ||||
-rw-r--r-- | volk/gen/make_cpuid_c.py | 30 | ||||
-rw-r--r-- | volk/lib/testqa.cc | 160 |
3 files changed, 109 insertions, 83 deletions
diff --git a/volk/gen/archs.xml b/volk/gen/archs.xml index 661b3f890..f6822871f 100644 --- a/volk/gen/archs.xml +++ b/volk/gen/archs.xml @@ -11,7 +11,7 @@ </arch> <arch name="neon" type="arm"> - <flag>mfpu=neon -funsafe-math-optimizations</flag> + <flag>mfpu=neon -mfloat-abi=softfp -funsafe-math-optimizations</flag> <alignment>16</alignment> </arch> diff --git a/volk/gen/make_cpuid_c.py b/volk/gen/make_cpuid_c.py index 3b2f12d5c..eb88dcd7f 100644 --- a/volk/gen/make_cpuid_c.py +++ b/volk/gen/make_cpuid_c.py @@ -157,9 +157,35 @@ int i_can_has_%s () { elif str(domarch.attributes["type"].value) == "arm": arch = str(domarch.attributes["name"].value); tempstring = tempstring + """\ +#if defined(__arm__) && defined(__linux__) +#include <asm/hwcap.h> +#include <linux/auxvec.h> +#include <stdio.h> +#define LOOK_FOR_NEON +#endif + int i_can_has_%s () { -#ifdef __NEON__ - return 1; +//it's linux-specific, but if you're compiling libvolk for NEON +//on Windows you have other problems + +#ifdef LOOK_FOR_NEON + FILE *auxvec_f; + unsigned long auxvec[2]; + unsigned int found_neon = 0; + auxvec_f = fopen("/proc/self/auxv", "rb"); + if(!auxvec_f) return 0; + + //so auxv is basically 32b of ID and 32b of value + //so it goes like this + while(!found_neon && auxvec_f) { + fread(auxvec, sizeof(unsigned long), 2, auxvec_f); + if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) + found_neon = 1; + } + + fclose(auxvec_f); + return found_neon; + #else return 0; #endif diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 98b09c50f..62e62c2f4 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -5,89 +5,89 @@ //VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a, 1e-4, 2046, 10000); //VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a, 1e-5, 32768.0, 204600, 10000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 204600, 1000); -VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 204600, 100); -VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 204600, 1000); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 204600, 10000); -VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 204600, 10000); -VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 204600, 10000); -//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000); -//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 204600, 50); -VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 204600, 100); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a, 1e-4, 32768.0, 20460, 1000); +VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a, 1, 0, 20460, 100); +VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a, 1e-5, 32768.0, 20460, 1000); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a, 1e-4, 32768.0, 20460, 10000); +VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 20460, 10000); +VOLK_RUN_TESTS(volk_16i_convert_8i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16i_max_star_16i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a, 0, 0, 20460, 10000); +//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000); +VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a, 1e-4, 0, 20460, 50); +VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a, 1e-4, 20.0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a, 1e-4, 10.0, 20460, 100); //VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a, 1e-4, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 204600, 100); -VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 204600, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 100); +VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a, 1, 32768, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a, 1, 2<<31, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_convert_64f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a, 1, 128, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 20460, 10000); //VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 20460, 100); -VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 204600, 2000); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 5000); +VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a, 1e-4, 0, 2046, 100); +VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 2000); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 20460, 5000); //VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000); -VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 0, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 204600, 3000); -VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 204600, 2000); -VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 204600, 2000); -VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 204600, 10000); -VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 204600, 100); -VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 204600, 100); -VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 204600, 3000); -VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 204600, 3000); -VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 204600, 5000); -VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 204600, 10000); -VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 204600, 10000); -VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 204600, 2000); +VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 0, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a, 1, 32768, 20460, 3000); +VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a, 0, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32f_x2_max_32f_a, 1e-4, 0, 20460, 2000); +VOLK_RUN_TESTS(volk_32f_x2_min_32f_a, 1e-4, 0, 20460, 2000); +VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_normalize_a, 1e-4, 100, 20460, 10000); +VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a, 1e-4, 4, 20460, 100); +VOLK_RUN_TESTS(volk_32f_sqrt_32f_a, 1e-4, 0, 20460, 100); +VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a, 1e-4, 100, 20460, 3000); +VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a, 1e-4, 0, 20460, 3000); +VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a, 1e-4, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a, 1e-4, 0, 20460, 5000); +VOLK_RUN_TESTS(volk_32i_x2_and_32i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a, 1e-4, 100, 20460, 10000); +VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 20460, 10000); +VOLK_RUN_TESTS(volk_32i_x2_or_32i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_32u_byteswap_a, 0, 0, 20460, 2000); //VOLK_RUN_TESTS(volk_32u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 204600, 1000); -VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 204600, 1000); +VOLK_RUN_TESTS(volk_64f_convert_32f_a, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_64f_x2_max_64f_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_64f_x2_min_64f_a, 1e-4, 0, 20460, 1000); +VOLK_RUN_TESTS(volk_64u_byteswap_a, 0, 0, 20460, 1000); //VOLK_RUN_TESTS(volk_64u_popcnt_a, 0, 0, 2046, 10000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 204600, 3000); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 204600, 3000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 204600, 3000); -VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 204600, 3000); -VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 204600, 10000); -VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 204600, 400); -VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 204600, 400); -VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 204600, 20000); -VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 204600, 2000); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 204600, 2000); -VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 204600, 2000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a, 0, 0, 20460, 3000); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a, 1e-4, 100, 20460, 3000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a, 0, 256, 20460, 3000); +VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a, 1e-4, 100, 20460, 3000); +VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a, 0, 0, 20460, 10000); +VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a, 0, 0, 20460, 400); +VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a, 1e-4, 100, 20460, 400); +VOLK_RUN_TESTS(volk_8i_convert_16i_a, 0, 0, 20460, 20000); +VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 20460, 2000); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a, 1e-4, 100, 20460, 2000); +VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 20460, 2000); |