From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 7 Dec 2010 18:50:28 -0500 Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels. This is a new SIMD library. It currently stands by itself under the GNU Radio tree and can be used separately. We will integrate the build process into GNU Raio and start building off of its functionality over time. --- volk/AUTHORS | 0 volk/COPYING | 674 +++++++++++++++++++++ volk/ChangeLog | 0 volk/INSTALL | 236 ++++++++ volk/Makefile.am | 56 ++ volk/Makefile.common | 46 ++ volk/NEWS | 0 volk/README | 1 + volk/apps/Makefile.am | 27 + volk/bootstrap | 29 + volk/config/Makefile.am | 60 ++ volk/config/acx_pthread.m4 | 275 +++++++++ volk/config/bnv_have_qt.m4 | 404 ++++++++++++ volk/config/cppunit.m4 | 80 +++ volk/config/gcc_version_workaround.m4 | 49 ++ volk/config/gr_lib64.m4 | 85 +++ volk/config/gr_libgnuradio_core_extra_ldflags.m4 | 40 ++ volk/config/gr_no_undefined.m4 | 44 ++ volk/config/gr_omnithread.m4 | 52 ++ volk/config/gr_pwin32.m4 | 146 +++++ volk/config/gr_set_md_cpu.m4 | 63 ++ volk/config/gr_sysv_shm.m4 | 36 ++ volk/config/lf_cc.m4 | 41 ++ volk/config/lf_cxx.m4 | 67 ++ volk/config/lf_warnings.m4 | 121 ++++ volk/config/lf_x11.m4 | 39 ++ volk/config/lv_configure.m4 | 110 ++++ volk/config/mkstemp.m4 | 89 +++ volk/config/onceonly.m4 | 63 ++ volk/config/pkg.m4 | 201 ++++++ volk/configure.ac | 62 ++ volk/data/Makefile.am | 30 + volk/doc/Makefile.am | 32 + volk/include/Makefile.am | 23 + volk/include/volk/Makefile.am | 133 ++++ volk/include/volk/archs.xml | 133 ++++ volk/include/volk/emit_omnilog.py | 13 + volk/include/volk/make_c.py | 76 +++ volk/include/volk/make_config_fixed.py | 21 + volk/include/volk/make_config_in.py | 13 + volk/include/volk/make_cpuid_generic_c.py | 38 ++ volk/include/volk/make_cpuid_h.py | 25 + volk/include/volk/make_cpuid_powerpc_c.py | 45 ++ volk/include/volk/make_cpuid_x86_c.py | 93 +++ volk/include/volk/make_environment_init_c.py | 33 + volk/include/volk/make_environment_init_h.py | 18 + volk/include/volk/make_h.py | 28 + volk/include/volk/make_init_c.py | 42 ++ volk/include/volk/make_init_h.py | 26 + volk/include/volk/make_mktables.py | 33 + volk/include/volk/make_proccpu_sim.py | 25 + volk/include/volk/make_registry.py | 62 ++ volk/include/volk/make_runtime.py | 34 ++ volk/include/volk/make_runtime_c.py | 47 ++ volk/include/volk/make_set_simd.py | 202 ++++++ volk/include/volk/make_typedefs.py | 23 + volk/include/volk/volk_16s_add_quad_aligned16.h | 136 +++++ .../volk/volk_16s_branch_4_state_8_aligned16.h | 194 ++++++ volk/include/volk/volk_16s_convert_32f_aligned16.h | 119 ++++ .../volk/volk_16s_convert_32f_unaligned16.h | 122 ++++ volk/include/volk/volk_16s_convert_8s_aligned16.h | 69 +++ .../include/volk/volk_16s_convert_8s_unaligned16.h | 71 +++ volk/include/volk/volk_16s_max_star_aligned16.h | 108 ++++ .../volk/volk_16s_max_star_horizontal_aligned16.h | 130 ++++ .../volk_16s_permute_and_scalar_add_aligned16.h | 139 +++++ .../volk/volk_16s_quad_max_star_aligned16.h | 191 ++++++ .../volk/volk_16sc_deinterleave_16s_aligned16.h | 146 +++++ .../volk/volk_16sc_deinterleave_32f_aligned16.h | 95 +++ .../volk_16sc_deinterleave_real_16s_aligned16.h | 120 ++++ .../volk_16sc_deinterleave_real_32f_aligned16.h | 125 ++++ .../volk_16sc_deinterleave_real_8s_aligned16.h | 83 +++ .../volk/volk_16sc_magnitude_16s_aligned16.h | 179 ++++++ .../volk/volk_16sc_magnitude_32f_aligned16.h | 163 +++++ volk/include/volk/volk_16u_byteswap_aligned16.h | 65 ++ volk/include/volk/volk_32f_accumulator_aligned16.h | 67 ++ volk/include/volk/volk_32f_add_aligned16.h | 69 +++ .../volk_32f_calc_spectral_noise_floor_aligned16.h | 167 +++++ volk/include/volk/volk_32f_convert_16s_aligned16.h | 110 ++++ .../volk/volk_32f_convert_16s_unaligned16.h | 113 ++++ volk/include/volk/volk_32f_convert_32s_aligned16.h | 106 ++++ .../volk/volk_32f_convert_32s_unaligned16.h | 109 ++++ volk/include/volk/volk_32f_convert_64f_aligned16.h | 70 +++ .../volk/volk_32f_convert_64f_unaligned16.h | 70 +++ volk/include/volk/volk_32f_convert_8s_aligned16.h | 117 ++++ .../include/volk/volk_32f_convert_8s_unaligned16.h | 120 ++++ volk/include/volk/volk_32f_divide_aligned16.h | 69 +++ volk/include/volk/volk_32f_dot_prod_aligned16.h | 184 ++++++ volk/include/volk/volk_32f_dot_prod_unaligned16.h | 184 ++++++ volk/include/volk/volk_32f_fm_detect_aligned16.h | 120 ++++ volk/include/volk/volk_32f_index_max_aligned16.h | 148 +++++ .../volk/volk_32f_interleave_16sc_aligned16.h | 155 +++++ .../volk/volk_32f_interleave_32fc_aligned16.h | 75 +++ volk/include/volk/volk_32f_max_aligned16.h | 71 +++ volk/include/volk/volk_32f_min_aligned16.h | 71 +++ volk/include/volk/volk_32f_multiply_aligned16.h | 69 +++ volk/include/volk/volk_32f_normalize_aligned16.h | 66 ++ volk/include/volk/volk_32f_power_aligned16.h | 144 +++++ volk/include/volk/volk_32f_sqrt_aligned16.h | 64 ++ volk/include/volk/volk_32f_stddev_aligned16.h | 144 +++++ .../volk/volk_32f_stddev_and_mean_aligned16.h | 169 ++++++ volk/include/volk/volk_32f_subtract_aligned16.h | 67 ++ volk/include/volk/volk_32f_sum_of_poly_aligned16.h | 151 +++++ .../volk/volk_32fc_32f_multiply_aligned16.h | 82 +++ .../volk/volk_32fc_32f_power_32fc_aligned16.h | 109 ++++ volk/include/volk/volk_32fc_atan2_32f_aligned16.h | 158 +++++ .../volk/volk_32fc_conjugate_dot_prod_aligned16.h | 344 +++++++++++ .../volk/volk_32fc_deinterleave_32f_aligned16.h | 75 +++ .../volk/volk_32fc_deinterleave_64f_aligned16.h | 78 +++ .../volk_32fc_deinterleave_real_16s_aligned16.h | 80 +++ .../volk_32fc_deinterleave_real_32f_aligned16.h | 68 +++ .../volk_32fc_deinterleave_real_64f_aligned16.h | 66 ++ volk/include/volk/volk_32fc_dot_prod_aligned16.h | 468 ++++++++++++++ volk/include/volk/volk_32fc_index_max_aligned16.h | 215 +++++++ .../volk/volk_32fc_magnitude_16s_aligned16.h | 146 +++++ .../volk/volk_32fc_magnitude_32f_aligned16.h | 121 ++++ volk/include/volk/volk_32fc_multiply_aligned16.h | 78 +++ ...olk_32fc_power_spectral_density_32f_aligned16.h | 134 ++++ .../volk/volk_32fc_power_spectrum_32f_aligned16.h | 126 ++++ .../include/volk/volk_32fc_square_dist_aligned16.h | 112 ++++ .../volk_32fc_square_dist_scalar_mult_aligned16.h | 126 ++++ volk/include/volk/volk_32s_and_aligned16.h | 69 +++ volk/include/volk/volk_32s_convert_32f_aligned16.h | 73 +++ .../volk/volk_32s_convert_32f_unaligned16.h | 75 +++ volk/include/volk/volk_32s_or_aligned16.h | 69 +++ volk/include/volk/volk_32u_byteswap_aligned16.h | 77 +++ volk/include/volk/volk_32u_popcnt_aligned16.h | 36 ++ volk/include/volk/volk_64f_convert_32f_aligned16.h | 67 ++ .../volk/volk_64f_convert_32f_unaligned16.h | 67 ++ volk/include/volk/volk_64f_max_aligned16.h | 71 +++ volk/include/volk/volk_64f_min_aligned16.h | 71 +++ volk/include/volk/volk_64u_byteswap_aligned16.h | 88 +++ volk/include/volk/volk_64u_popcnt_aligned16.h | 74 +++ volk/include/volk/volk_8s_convert_16s_aligned16.h | 71 +++ .../include/volk/volk_8s_convert_16s_unaligned16.h | 73 +++ volk/include/volk/volk_8s_convert_32f_aligned16.h | 92 +++ .../include/volk/volk_8s_convert_32f_unaligned16.h | 94 +++ .../volk/volk_8sc_deinterleave_16s_aligned16.h | 77 +++ .../volk/volk_8sc_deinterleave_32f_aligned16.h | 164 +++++ .../volk_8sc_deinterleave_real_16s_aligned16.h | 66 ++ .../volk_8sc_deinterleave_real_32f_aligned16.h | 133 ++++ .../volk/volk_8sc_deinterleave_real_8s_aligned16.h | 67 ++ .../volk_8sc_multiply_conjugate_16sc_aligned16.h | 102 ++++ .../volk_8sc_multiply_conjugate_32fc_aligned16.h | 122 ++++ volk/include/volk/volk_common.h | 18 + volk/include/volk/volk_complex.h | 71 +++ volk/include/volk/volk_regexp.py | 8 + volk/include/volk/volk_register.py | 279 +++++++++ volk/lib/Makefile.am | 361 +++++++++++ volk/lib/assembly.h | 67 ++ volk/lib/cpuid_x86.S | 60 ++ volk/lib/cpuid_x86_64.S | 54 ++ volk/lib/qa_16s_add_quad_aligned16.cc | 89 +++ volk/lib/qa_16s_add_quad_aligned16.h | 18 + volk/lib/qa_16s_branch_4_state_8_aligned16.cc | 106 ++++ volk/lib/qa_16s_branch_4_state_8_aligned16.h | 18 + volk/lib/qa_16s_convert_32f_aligned16.cc | 73 +++ volk/lib/qa_16s_convert_32f_aligned16.h | 18 + volk/lib/qa_16s_convert_32f_unaligned16.cc | 73 +++ volk/lib/qa_16s_convert_32f_unaligned16.h | 18 + volk/lib/qa_16s_convert_8s_aligned16.cc | 60 ++ volk/lib/qa_16s_convert_8s_aligned16.h | 18 + volk/lib/qa_16s_convert_8s_unaligned16.cc | 60 ++ volk/lib/qa_16s_convert_8s_unaligned16.h | 18 + volk/lib/qa_16s_max_star_aligned16.cc | 65 ++ volk/lib/qa_16s_max_star_aligned16.h | 18 + volk/lib/qa_16s_max_star_horizontal_aligned16.cc | 79 +++ volk/lib/qa_16s_max_star_horizontal_aligned16.h | 18 + .../lib/qa_16s_permute_and_scalar_add_aligned16.cc | 78 +++ volk/lib/qa_16s_permute_and_scalar_add_aligned16.h | 18 + volk/lib/qa_16s_quad_max_star_aligned16.cc | 59 ++ volk/lib/qa_16s_quad_max_star_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 76 +++ volk/lib/qa_16sc_deinterleave_16s_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 63 ++ volk/lib/qa_16sc_deinterleave_32f_aligned16.h | 18 + .../lib/qa_16sc_deinterleave_real_16s_aligned16.cc | 71 +++ volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h | 18 + .../lib/qa_16sc_deinterleave_real_32f_aligned16.cc | 123 ++++ volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h | 18 + volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 60 ++ volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h | 18 + volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 70 +++ volk/lib/qa_16sc_magnitude_16s_aligned16.h | 18 + volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 70 +++ volk/lib/qa_16sc_magnitude_32f_aligned16.h | 18 + volk/lib/qa_16u_byteswap_aligned16.cc | 60 ++ volk/lib/qa_16u_byteswap_aligned16.h | 18 + volk/lib/qa_32f_accumulator_aligned16.cc | 56 ++ volk/lib/qa_32f_accumulator_aligned16.h | 18 + volk/lib/qa_32f_add_aligned16.cc | 60 ++ volk/lib/qa_32f_add_aligned16.h | 18 + .../qa_32f_calc_spectral_noise_floor_aligned16.cc | 59 ++ .../qa_32f_calc_spectral_noise_floor_aligned16.h | 18 + volk/lib/qa_32f_convert_16s_aligned16.cc | 70 +++ volk/lib/qa_32f_convert_16s_aligned16.h | 18 + volk/lib/qa_32f_convert_16s_unaligned16.cc | 70 +++ volk/lib/qa_32f_convert_16s_unaligned16.h | 18 + volk/lib/qa_32f_convert_32s_aligned16.cc | 70 +++ volk/lib/qa_32f_convert_32s_aligned16.h | 18 + volk/lib/qa_32f_convert_32s_unaligned16.cc | 70 +++ volk/lib/qa_32f_convert_32s_unaligned16.h | 18 + volk/lib/qa_32f_convert_64f_aligned16.cc | 60 ++ volk/lib/qa_32f_convert_64f_aligned16.h | 18 + volk/lib/qa_32f_convert_64f_unaligned16.cc | 60 ++ volk/lib/qa_32f_convert_64f_unaligned16.h | 18 + volk/lib/qa_32f_convert_8s_aligned16.cc | 70 +++ volk/lib/qa_32f_convert_8s_aligned16.h | 18 + volk/lib/qa_32f_convert_8s_unaligned16.cc | 70 +++ volk/lib/qa_32f_convert_8s_unaligned16.h | 18 + volk/lib/qa_32f_divide_aligned16.cc | 60 ++ volk/lib/qa_32f_divide_aligned16.h | 18 + volk/lib/qa_32f_dot_prod_aligned16.cc | 183 ++++++ volk/lib/qa_32f_dot_prod_aligned16.h | 18 + volk/lib/qa_32f_dot_prod_unaligned16.cc | 190 ++++++ volk/lib/qa_32f_dot_prod_unaligned16.h | 18 + volk/lib/qa_32f_fm_detect_aligned16.cc | 60 ++ volk/lib/qa_32f_fm_detect_aligned16.h | 18 + volk/lib/qa_32f_index_max_aligned16.cc | 103 ++++ volk/lib/qa_32f_index_max_aligned16.h | 18 + volk/lib/qa_32f_interleave_16sc_aligned16.cc | 75 +++ volk/lib/qa_32f_interleave_16sc_aligned16.h | 18 + volk/lib/qa_32f_interleave_32fc_aligned16.cc | 62 ++ volk/lib/qa_32f_interleave_32fc_aligned16.h | 18 + volk/lib/qa_32f_max_aligned16.cc | 60 ++ volk/lib/qa_32f_max_aligned16.h | 18 + volk/lib/qa_32f_min_aligned16.cc | 60 ++ volk/lib/qa_32f_min_aligned16.h | 18 + volk/lib/qa_32f_multiply_aligned16.cc | 60 ++ volk/lib/qa_32f_multiply_aligned16.h | 18 + volk/lib/qa_32f_normalize_aligned16.cc | 65 ++ volk/lib/qa_32f_normalize_aligned16.h | 18 + volk/lib/qa_32f_power_aligned16.cc | 95 +++ volk/lib/qa_32f_power_aligned16.h | 18 + volk/lib/qa_32f_sqrt_aligned16.cc | 59 ++ volk/lib/qa_32f_sqrt_aligned16.h | 18 + volk/lib/qa_32f_stddev_aligned16.cc | 74 +++ volk/lib/qa_32f_stddev_aligned16.h | 18 + volk/lib/qa_32f_stddev_and_mean_aligned16.cc | 75 +++ volk/lib/qa_32f_stddev_and_mean_aligned16.h | 18 + volk/lib/qa_32f_subtract_aligned16.cc | 60 ++ volk/lib/qa_32f_subtract_aligned16.h | 18 + volk/lib/qa_32f_sum_of_poly_aligned16.cc | 142 +++++ volk/lib/qa_32f_sum_of_poly_aligned16.h | 18 + volk/lib/qa_32fc_32f_multiply_aligned16.cc | 85 +++ volk/lib/qa_32fc_32f_multiply_aligned16.h | 18 + volk/lib/qa_32fc_32f_power_32fc_aligned16.cc | 83 +++ volk/lib/qa_32fc_32f_power_32fc_aligned16.h | 18 + volk/lib/qa_32fc_atan2_32f_aligned16.cc | 75 +++ volk/lib/qa_32fc_atan2_32f_aligned16.h | 18 + volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc | 137 +++++ volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h | 18 + volk/lib/qa_32fc_deinterleave_32f_aligned16.cc | 63 ++ volk/lib/qa_32fc_deinterleave_32f_aligned16.h | 18 + volk/lib/qa_32fc_deinterleave_64f_aligned16.cc | 63 ++ volk/lib/qa_32fc_deinterleave_64f_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_16s_aligned16.cc | 60 ++ volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_32f_aligned16.cc | 60 ++ volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h | 18 + .../lib/qa_32fc_deinterleave_real_64f_aligned16.cc | 60 ++ volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h | 18 + volk/lib/qa_32fc_dot_prod_aligned16.cc | 214 +++++++ volk/lib/qa_32fc_dot_prod_aligned16.h | 20 + volk/lib/qa_32fc_index_max_aligned16.cc | 89 +++ volk/lib/qa_32fc_index_max_aligned16.h | 18 + volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 70 +++ volk/lib/qa_32fc_magnitude_16s_aligned16.h | 18 + volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 70 +++ volk/lib/qa_32fc_magnitude_32f_aligned16.h | 18 + volk/lib/qa_32fc_multiply_aligned16.cc | 86 +++ volk/lib/qa_32fc_multiply_aligned16.h | 18 + ...qa_32fc_power_spectral_density_32f_aligned16.cc | 63 ++ .../qa_32fc_power_spectral_density_32f_aligned16.h | 18 + volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc | 63 ++ volk/lib/qa_32fc_power_spectrum_32f_aligned16.h | 18 + volk/lib/qa_32fc_square_dist_aligned16.cc | 91 +++ volk/lib/qa_32fc_square_dist_aligned16.h | 18 + .../qa_32fc_square_dist_scalar_mult_aligned16.cc | 96 +++ .../qa_32fc_square_dist_scalar_mult_aligned16.h | 18 + volk/lib/qa_32s_and_aligned16.cc | 60 ++ volk/lib/qa_32s_and_aligned16.h | 18 + volk/lib/qa_32s_convert_32f_aligned16.cc | 60 ++ volk/lib/qa_32s_convert_32f_aligned16.h | 18 + volk/lib/qa_32s_convert_32f_unaligned16.cc | 60 ++ volk/lib/qa_32s_convert_32f_unaligned16.h | 18 + volk/lib/qa_32s_or_aligned16.cc | 60 ++ volk/lib/qa_32s_or_aligned16.h | 18 + volk/lib/qa_32u_byteswap_aligned16.cc | 59 ++ volk/lib/qa_32u_byteswap_aligned16.h | 18 + volk/lib/qa_32u_popcnt_aligned16.cc | 61 ++ volk/lib/qa_32u_popcnt_aligned16.h | 18 + volk/lib/qa_64f_convert_32f_aligned16.cc | 60 ++ volk/lib/qa_64f_convert_32f_aligned16.h | 18 + volk/lib/qa_64f_convert_32f_unaligned16.cc | 60 ++ volk/lib/qa_64f_convert_32f_unaligned16.h | 18 + volk/lib/qa_64f_max_aligned16.cc | 60 ++ volk/lib/qa_64f_max_aligned16.h | 18 + volk/lib/qa_64f_min_aligned16.cc | 60 ++ volk/lib/qa_64f_min_aligned16.h | 18 + volk/lib/qa_64u_byteswap_aligned16.cc | 59 ++ volk/lib/qa_64u_byteswap_aligned16.h | 18 + volk/lib/qa_64u_popcnt_aligned16.cc | 61 ++ volk/lib/qa_64u_popcnt_aligned16.h | 18 + volk/lib/qa_8s_convert_16s_aligned16.cc | 63 ++ volk/lib/qa_8s_convert_16s_aligned16.h | 18 + volk/lib/qa_8s_convert_16s_unaligned16.cc | 63 ++ volk/lib/qa_8s_convert_16s_unaligned16.h | 18 + volk/lib/qa_8s_convert_32f_aligned16.cc | 63 ++ volk/lib/qa_8s_convert_32f_aligned16.h | 18 + volk/lib/qa_8s_convert_32f_unaligned16.cc | 63 ++ volk/lib/qa_8s_convert_32f_unaligned16.h | 18 + volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 67 ++ volk/lib/qa_8sc_deinterleave_16s_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_32f_aligned16.cc | 134 ++++ volk/lib/qa_8sc_deinterleave_32f_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc | 64 ++ volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc | 138 +++++ volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h | 18 + volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc | 60 ++ volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h | 18 + .../qa_8sc_multiply_conjugate_16sc_aligned16.cc | 87 +++ .../lib/qa_8sc_multiply_conjugate_16sc_aligned16.h | 18 + .../qa_8sc_multiply_conjugate_32fc_aligned16.cc | 87 +++ .../lib/qa_8sc_multiply_conjugate_32fc_aligned16.h | 18 + volk/lib/qa_volk.cc | 211 +++++++ volk/lib/qa_volk.h | 36 ++ volk/lib/test_all.cc | 82 +++ volk/lib/volk_rank_archs.c | 13 + volk/lib/volk_rank_archs.h | 14 + volk/libvector_replace.sh | 18 + volk/python/Makefile.am | 95 +++ volk/python/__init__.py | 53 ++ volk/python/libvector.i | 47 ++ volk/python/libvector_square_ff.i | 37 ++ volk/python/qa_square.py | 47 ++ volk/python/run_tests.in | 50 ++ volk/spu_lib/gc_spu_macs.h | 380 ++++++++++++ volk/spu_lib/spu_16s_cmpgt_unaligned.c | 160 +++++ volk/spu_lib/spu_16s_vector_subtract_unaligned.c | 178 ++++++ volk/spu_lib/spu_16s_vector_sum_unaligned.c | 178 ++++++ .../spu_32fc_pointwise_multiply_unaligned.c | 222 +++++++ volk/spu_lib/spu_memcpy_unaligned.c | 290 +++++++++ volk/spu_lib/spu_memset_unaligned.S | 185 ++++++ volk/system_cleanup.sh | 29 + volk/volk.pc.in | 15 + 346 files changed, 25748 insertions(+) create mode 100644 volk/AUTHORS create mode 100644 volk/COPYING create mode 100644 volk/ChangeLog create mode 100644 volk/INSTALL create mode 100644 volk/Makefile.am create mode 100644 volk/Makefile.common create mode 100644 volk/NEWS create mode 100644 volk/README create mode 100644 volk/apps/Makefile.am create mode 100755 volk/bootstrap create mode 100644 volk/config/Makefile.am create mode 100644 volk/config/acx_pthread.m4 create mode 100644 volk/config/bnv_have_qt.m4 create mode 100644 volk/config/cppunit.m4 create mode 100644 volk/config/gcc_version_workaround.m4 create mode 100644 volk/config/gr_lib64.m4 create mode 100644 volk/config/gr_libgnuradio_core_extra_ldflags.m4 create mode 100644 volk/config/gr_no_undefined.m4 create mode 100644 volk/config/gr_omnithread.m4 create mode 100644 volk/config/gr_pwin32.m4 create mode 100644 volk/config/gr_set_md_cpu.m4 create mode 100644 volk/config/gr_sysv_shm.m4 create mode 100644 volk/config/lf_cc.m4 create mode 100644 volk/config/lf_cxx.m4 create mode 100644 volk/config/lf_warnings.m4 create mode 100644 volk/config/lf_x11.m4 create mode 100644 volk/config/lv_configure.m4 create mode 100644 volk/config/mkstemp.m4 create mode 100644 volk/config/onceonly.m4 create mode 100644 volk/config/pkg.m4 create mode 100644 volk/configure.ac create mode 100644 volk/data/Makefile.am create mode 100644 volk/doc/Makefile.am create mode 100644 volk/include/Makefile.am create mode 100644 volk/include/volk/Makefile.am create mode 100644 volk/include/volk/archs.xml create mode 100644 volk/include/volk/emit_omnilog.py create mode 100644 volk/include/volk/make_c.py create mode 100644 volk/include/volk/make_config_fixed.py create mode 100644 volk/include/volk/make_config_in.py create mode 100644 volk/include/volk/make_cpuid_generic_c.py create mode 100644 volk/include/volk/make_cpuid_h.py create mode 100644 volk/include/volk/make_cpuid_powerpc_c.py create mode 100644 volk/include/volk/make_cpuid_x86_c.py create mode 100644 volk/include/volk/make_environment_init_c.py create mode 100644 volk/include/volk/make_environment_init_h.py create mode 100644 volk/include/volk/make_h.py create mode 100644 volk/include/volk/make_init_c.py create mode 100644 volk/include/volk/make_init_h.py create mode 100644 volk/include/volk/make_mktables.py create mode 100644 volk/include/volk/make_proccpu_sim.py create mode 100644 volk/include/volk/make_registry.py create mode 100644 volk/include/volk/make_runtime.py create mode 100644 volk/include/volk/make_runtime_c.py create mode 100644 volk/include/volk/make_set_simd.py create mode 100644 volk/include/volk/make_typedefs.py create mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h create mode 100644 volk/include/volk/volk_32f_add_aligned16.h create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_divide_aligned16.h create mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h create mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h create mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h create mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h create mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32f_max_aligned16.h create mode 100644 volk/include/volk/volk_32f_min_aligned16.h create mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h create mode 100644 volk/include/volk/volk_32f_power_aligned16.h create mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h create mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h create mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h create mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/include/volk/volk_32s_and_aligned16.h create mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_32s_or_aligned16.h create mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_64f_max_aligned16.h create mode 100644 volk/include/volk/volk_64f_min_aligned16.h create mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h create mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h create mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/include/volk/volk_common.h create mode 100644 volk/include/volk/volk_complex.h create mode 100644 volk/include/volk/volk_regexp.py create mode 100755 volk/include/volk/volk_register.py create mode 100644 volk/lib/Makefile.am create mode 100644 volk/lib/assembly.h create mode 100644 volk/lib/cpuid_x86.S create mode 100644 volk/lib/cpuid_x86_64.S create mode 100644 volk/lib/qa_16s_add_quad_aligned16.cc create mode 100644 volk/lib/qa_16s_add_quad_aligned16.h create mode 100644 volk/lib/qa_16s_branch_4_state_8_aligned16.cc create mode 100644 volk/lib/qa_16s_branch_4_state_8_aligned16.h create mode 100644 volk/lib/qa_16s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_16s_convert_32f_aligned16.h create mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_16s_convert_8s_aligned16.cc create mode 100644 volk/lib/qa_16s_convert_8s_aligned16.h create mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.cc create mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.h create mode 100644 volk/lib/qa_16s_max_star_aligned16.cc create mode 100644 volk/lib/qa_16s_max_star_aligned16.h create mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.cc create mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.h create mode 100644 volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc create mode 100644 volk/lib/qa_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/lib/qa_16s_quad_max_star_aligned16.cc create mode 100644 volk/lib/qa_16s_quad_max_star_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc create mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.cc create mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.h create mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.cc create mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.h create mode 100644 volk/lib/qa_16u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_16u_byteswap_aligned16.h create mode 100644 volk/lib/qa_32f_accumulator_aligned16.cc create mode 100644 volk/lib/qa_32f_accumulator_aligned16.h create mode 100644 volk/lib/qa_32f_add_aligned16.cc create mode 100644 volk/lib/qa_32f_add_aligned16.h create mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc create mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h create mode 100644 volk/lib/qa_32f_convert_16s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_16s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_32s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_32s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_64f_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_64f_aligned16.h create mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.h create mode 100644 volk/lib/qa_32f_convert_8s_aligned16.cc create mode 100644 volk/lib/qa_32f_convert_8s_aligned16.h create mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.cc create mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.h create mode 100644 volk/lib/qa_32f_divide_aligned16.cc create mode 100644 volk/lib/qa_32f_divide_aligned16.h create mode 100644 volk/lib/qa_32f_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32f_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.cc create mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.h create mode 100644 volk/lib/qa_32f_fm_detect_aligned16.cc create mode 100644 volk/lib/qa_32f_fm_detect_aligned16.h create mode 100644 volk/lib/qa_32f_index_max_aligned16.cc create mode 100644 volk/lib/qa_32f_index_max_aligned16.h create mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.cc create mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.h create mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.cc create mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.h create mode 100644 volk/lib/qa_32f_max_aligned16.cc create mode 100644 volk/lib/qa_32f_max_aligned16.h create mode 100644 volk/lib/qa_32f_min_aligned16.cc create mode 100644 volk/lib/qa_32f_min_aligned16.h create mode 100644 volk/lib/qa_32f_multiply_aligned16.cc create mode 100644 volk/lib/qa_32f_multiply_aligned16.h create mode 100644 volk/lib/qa_32f_normalize_aligned16.cc create mode 100644 volk/lib/qa_32f_normalize_aligned16.h create mode 100644 volk/lib/qa_32f_power_aligned16.cc create mode 100644 volk/lib/qa_32f_power_aligned16.h create mode 100644 volk/lib/qa_32f_sqrt_aligned16.cc create mode 100644 volk/lib/qa_32f_sqrt_aligned16.h create mode 100644 volk/lib/qa_32f_stddev_aligned16.cc create mode 100644 volk/lib/qa_32f_stddev_aligned16.h create mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.cc create mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.h create mode 100644 volk/lib/qa_32f_subtract_aligned16.cc create mode 100644 volk/lib/qa_32f_subtract_aligned16.h create mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.cc create mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.h create mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.cc create mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.h create mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.cc create mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc create mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h create mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.cc create mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.h create mode 100644 volk/lib/qa_32fc_index_max_aligned16.cc create mode 100644 volk/lib/qa_32fc_index_max_aligned16.h create mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.cc create mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.h create mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_multiply_aligned16.cc create mode 100644 volk/lib/qa_32fc_multiply_aligned16.h create mode 100644 volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc create mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/lib/qa_32fc_square_dist_aligned16.cc create mode 100644 volk/lib/qa_32fc_square_dist_aligned16.h create mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc create mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/lib/qa_32s_and_aligned16.cc create mode 100644 volk/lib/qa_32s_and_aligned16.h create mode 100644 volk/lib/qa_32s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_32s_convert_32f_aligned16.h create mode 100644 volk/lib/qa_32s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_32s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_32s_or_aligned16.cc create mode 100644 volk/lib/qa_32s_or_aligned16.h create mode 100644 volk/lib/qa_32u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_32u_byteswap_aligned16.h create mode 100644 volk/lib/qa_32u_popcnt_aligned16.cc create mode 100644 volk/lib/qa_32u_popcnt_aligned16.h create mode 100644 volk/lib/qa_64f_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_64f_convert_32f_aligned16.h create mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_64f_max_aligned16.cc create mode 100644 volk/lib/qa_64f_max_aligned16.h create mode 100644 volk/lib/qa_64f_min_aligned16.cc create mode 100644 volk/lib/qa_64f_min_aligned16.h create mode 100644 volk/lib/qa_64u_byteswap_aligned16.cc create mode 100644 volk/lib/qa_64u_byteswap_aligned16.h create mode 100644 volk/lib/qa_64u_popcnt_aligned16.cc create mode 100644 volk/lib/qa_64u_popcnt_aligned16.h create mode 100644 volk/lib/qa_8s_convert_16s_aligned16.cc create mode 100644 volk/lib/qa_8s_convert_16s_aligned16.h create mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.cc create mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.h create mode 100644 volk/lib/qa_8s_convert_32f_aligned16.cc create mode 100644 volk/lib/qa_8s_convert_32f_aligned16.h create mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.cc create mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc create mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc create mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h create mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc create mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/lib/qa_volk.cc create mode 100644 volk/lib/qa_volk.h create mode 100644 volk/lib/test_all.cc create mode 100644 volk/lib/volk_rank_archs.c create mode 100644 volk/lib/volk_rank_archs.h create mode 100755 volk/libvector_replace.sh create mode 100644 volk/python/Makefile.am create mode 100644 volk/python/__init__.py create mode 100644 volk/python/libvector.i create mode 100644 volk/python/libvector_square_ff.i create mode 100755 volk/python/qa_square.py create mode 100644 volk/python/run_tests.in create mode 100644 volk/spu_lib/gc_spu_macs.h create mode 100644 volk/spu_lib/spu_16s_cmpgt_unaligned.c create mode 100644 volk/spu_lib/spu_16s_vector_subtract_unaligned.c create mode 100644 volk/spu_lib/spu_16s_vector_sum_unaligned.c create mode 100644 volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c create mode 100644 volk/spu_lib/spu_memcpy_unaligned.c create mode 100644 volk/spu_lib/spu_memset_unaligned.S create mode 100755 volk/system_cleanup.sh create mode 100644 volk/volk.pc.in (limited to 'volk') diff --git a/volk/AUTHORS b/volk/AUTHORS new file mode 100644 index 000000000..e69de29bb diff --git a/volk/COPYING b/volk/COPYING new file mode 100644 index 000000000..94a9ed024 --- /dev/null +++ b/volk/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/volk/ChangeLog b/volk/ChangeLog new file mode 100644 index 000000000..e69de29bb diff --git a/volk/INSTALL b/volk/INSTALL new file mode 100644 index 000000000..23e5f25d0 --- /dev/null +++ b/volk/INSTALL @@ -0,0 +1,236 @@ +Installation Instructions +************************* + +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005 Free +Software Foundation, Inc. + +This file is free documentation; the Free Software Foundation gives +unlimited permission to copy, distribute and modify it. + +Basic Installation +================== + +These are generic installation instructions. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, and a +file `config.log' containing compiler output (useful mainly for +debugging `configure'). + + It can also use an optional file (typically called `config.cache' +and enabled with `--cache-file=config.cache' or simply `-C') that saves +the results of its tests to speed up reconfiguring. (Caching is +disabled by default to prevent problems with accidental use of stale +cache files.) + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If you are using the cache, and at +some point `config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file `configure.ac' (or `configure.in') is used to create +`configure' by a program called `autoconf'. You only need +`configure.ac' if you want to change it or regenerate `configure' using +a newer version of `autoconf'. + +The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. If you're + using `csh' on an old version of System V, you might need to type + `sh ./configure' instead to prevent `csh' from trying to execute + `configure' itself. + + Running `configure' takes awhile. While running, it prints some + messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package. + + 4. Type `make install' to install the programs and any data files and + documentation. + + 5. You can remove the program binaries and object files from the + source code directory by typing `make clean'. To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. There is + also a `make maintainer-clean' target, but that is intended mainly + for the package's developers. If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + +Compilers and Options +===================== + +Some systems require unusual options for compilation or linking that the +`configure' script does not know about. Run `./configure --help' for +details on some of the pertinent environment variables. + + You can give `configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here +is an example: + + ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix + + *Note Defining Variables::, for more details. + +Compiling For Multiple Architectures +==================================== + +You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you must use a version of `make' that +supports the `VPATH' variable, such as GNU `make'. `cd' to the +directory where you want the object files and executables to go and run +the `configure' script. `configure' automatically checks for the +source code in the directory that `configure' is in and in `..'. + + If you have to use a `make' that does not support the `VPATH' +variable, you have to compile the package for one architecture at a +time in the source code directory. After you have installed the +package for one architecture, use `make distclean' before reconfiguring +for another architecture. + +Installation Names +================== + +By default, `make install' installs the package's commands under +`/usr/local/bin', include files under `/usr/local/include', etc. You +can specify an installation prefix other than `/usr/local' by giving +`configure' the option `--prefix=PREFIX'. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +pass the option `--exec-prefix=PREFIX' to `configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like `--bindir=DIR' to specify different values for particular +kinds of files. Run `configure --help' for a list of the directories +you can set and what kinds of files go in them. + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + +Optional Features +================= + +Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + +Specifying the System Type +========================== + +There may be some features `configure' cannot figure out automatically, +but needs to determine by the type of machine the package will run on. +Usually, assuming the package is built to be run on the _same_ +architectures, `configure' can figure that out, but if it prints a +message saying it cannot guess the machine type, give it the +`--build=TYPE' option. TYPE can either be a short name for the system +type, such as `sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS KERNEL-OS + + See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option `--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with `--host=TYPE'. + +Sharing Defaults +================ + +If you want to set default values for `configure' scripts to share, you +can create a site shell script called `config.site' that gives default +values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. +A warning: not all `configure' scripts look for a site script. + +Defining Variables +================== + +Variables not defined in a site shell script can be set in the +environment passed to `configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the `configure' command line, using `VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified `gcc' to be used as the C compiler (unless it is +overridden in the site shell script). Here is a another example: + + /bin/bash ./configure CONFIG_SHELL=/bin/bash + +Here the `CONFIG_SHELL=/bin/bash' operand causes subsequent +configuration-related scripts to be executed by `/bin/bash'. + +`configure' Invocation +====================== + +`configure' recognizes the following options to control how it operates. + +`--help' +`-h' + Print a summary of the options to `configure', and exit. + +`--version' +`-V' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally `config.cache'. FILE defaults to `/dev/null' to + disable caching. + +`--config-cache' +`-C' + Alias for `--cache-file=config.cache'. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to `/dev/null' (any error + messages will still be shown). + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`configure' also accepts some other, not widely useful, options. Run +`configure --help' for more details. + diff --git a/volk/Makefile.am b/volk/Makefile.am new file mode 100644 index 000000000..d2ef3dad4 --- /dev/null +++ b/volk/Makefile.am @@ -0,0 +1,56 @@ +# +# Copyright 2004,2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +ACLOCAL_AMFLAGS = -I config + +include $(top_srcdir)/Makefile.common + +EXTRA_DIST = bootstrap configure config.h.in +SUBDIRS = config include lib +#if USE_PYTHON +#SUBDIRS += python +#endif + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = volk.pc + +distclean-local: + -rm -f config/lv_set_simd_flags.m4 + -rm -rf autom4te.cache + -rm -f config.* + -rm -f depcomp + -rm -f install-sh + -rm -f ltmain.sh + -rm -f py-compile + -rm -f missing + -rm -f volk_config.h.in + -rm -f aclocal.m4 + -rm -f Makefile.in + -rm -f config/lt* + -rm -f config/libtool* + -rm -f apps/Makefile.in + -rm -f config/Makefile.in + -rm -f data/Makefile.in + -rm -f doc/Makefile.in + -rm -f include/Makefile.in + -rm -f lib/Makefile.in + -rm -f python/Makefile.in + -rm -f configure \ No newline at end of file diff --git a/volk/Makefile.common b/volk/Makefile.common new file mode 100644 index 000000000..3b028b147 --- /dev/null +++ b/volk/Makefile.common @@ -0,0 +1,46 @@ +# -*- Makefile -*- +# +# Copyright 2004,2006,2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +ourincludedir = $(includedir)/volk + +# swig includes +ourswigincludedir = $(ourincludedir)/swig + +# Install this stuff in the appropriate subdirectory +# This usually ends up at: +# ${prefix}/lib/python${python_version}/site-packages/libvector + +ourpythondir = $(pythondir)/volk +ourpyexecdir = $(pyexecdir)/volk + + +# swig flags +SWIGPYTHONFLAGS = -fvirtual -python -modern +SWIGGRFLAGS = -I$(GNURADIO_CORE_INCLUDEDIR)/swig -I$(GNURADIO_CORE_INCLUDEDIR) + +# standard defins and includes +STD_DEFINES_AND_INCLUDES=-I$(top_srcdir)/include -I$(top_srcdir)/lib $(GNURADIO_CORE_CPPFLAGS) + +# Don't assume that make predefines $(RM), because BSD make does +# not. We define it now in configure.ac using AM_PATH_PROG, but now +# here have to add a -f to be like GNU make. +RM=$(RM_PROG) -f diff --git a/volk/NEWS b/volk/NEWS new file mode 100644 index 000000000..e69de29bb diff --git a/volk/README b/volk/README new file mode 100644 index 000000000..c21b99220 --- /dev/null +++ b/volk/README @@ -0,0 +1 @@ +Follow the install instructions. After ./configure, read LIBVECTOR_MANUAL. \ No newline at end of file diff --git a/volk/apps/Makefile.am b/volk/apps/Makefile.am new file mode 100644 index 000000000..a51823971 --- /dev/null +++ b/volk/apps/Makefile.am @@ -0,0 +1,27 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +# C++ stuff here + +if USE_PYTHON +# python stuff here +endif diff --git a/volk/bootstrap b/volk/bootstrap new file mode 100755 index 000000000..ff239c88c --- /dev/null +++ b/volk/bootstrap @@ -0,0 +1,29 @@ +#!/bin/sh + +# Copyright 2001,2005,2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +rm -fr config.cache autom4te*.cache + +cd include/volk && chmod +x volk_register.py && ./volk_register.py && cd ../.. +aclocal -I config +autoconf +autoheader +libtoolize --automake +automake --add-missing -Wno-portability -Wno-override -Wnone + diff --git a/volk/config/Makefile.am b/volk/config/Makefile.am new file mode 100644 index 000000000..b7dc2c161 --- /dev/null +++ b/volk/config/Makefile.am @@ -0,0 +1,60 @@ +# +# Copyright 2001 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +include $(top_srcdir)/Makefile.common + +# Install m4 macros in this directory +m4datadir = $(datadir)/aclocal + +# List your m4 macros here +m4macros = \ + acx_pthread.m4 \ + bnv_have_qt.m4 \ + cppunit.m4 \ + gr_check_shm_open.m4 \ + gr_lib64.m4 \ + gr_libgnuradio_core_extra_ldflags.m4 \ + gr_no_undefined.m4 \ + gr_omnithread.m4 \ + gr_pwin32.m4 \ + gr_set_md_cpu.m4 \ + lv_configure.m4 \ + gr_subversion.m4 \ + gr_sysv_shm.m4 \ + lf_cc.m4 \ + lf_cxx.m4 \ + lf_warnings.m4 \ + lv_lf_warnings.m4 \ + lf_x11.m4 \ + lv_set_simd_flags.m4 \ + lv_set_lv_arch.m4 \ + mkstemp.m4 \ + onceonly.m4 \ + pkg.m4 \ + gcc_version_workaround.m4 + + + + +# Don't install m4 macros anymore +# m4data_DATA = $(m4macros) + +EXTRA_DIST = $(m4macros) diff --git a/volk/config/acx_pthread.m4 b/volk/config/acx_pthread.m4 new file mode 100644 index 000000000..eb09f5acc --- /dev/null +++ b/volk/config/acx_pthread.m4 @@ -0,0 +1,275 @@ +# =========================================================================== +# http://autoconf-archive.cryp.to/acx_pthread.html +# =========================================================================== +# +# SYNOPSIS +# +# ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) +# +# DESCRIPTION +# +# This macro figures out how to build C programs using POSIX threads. It +# sets the PTHREAD_LIBS output variable to the threads library and linker +# flags, and the PTHREAD_CFLAGS output variable to any special C compiler +# flags that are needed. (The user can also force certain compiler +# flags/libs to be tested by setting these environment variables.) +# +# Also sets PTHREAD_CC to any special C compiler that is needed for +# multi-threaded programs (defaults to the value of CC otherwise). (This +# is necessary on AIX to use the special cc_r compiler alias.) +# +# NOTE: You are assumed to not only compile your program with these flags, +# but also link it with them as well. e.g. you should link with +# $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS +# +# If you are only building threads programs, you may wish to use these +# variables in your default LIBS, CFLAGS, and CC: +# +# LIBS="$PTHREAD_LIBS $LIBS" +# CFLAGS="$CFLAGS $PTHREAD_CFLAGS" +# CC="$PTHREAD_CC" +# +# In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant +# has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name +# (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +# +# ACTION-IF-FOUND is a list of shell commands to run if a threads library +# is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it +# is not found. If ACTION-IF-FOUND is not specified, the default action +# will define HAVE_PTHREAD. +# +# Please let the authors know if this macro fails on any platform, or if +# you have any other suggestions or comments. This macro was based on work +# by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help +# from M. Frigo), as well as ac_pthread and hb_pthread macros posted by +# Alejandro Forero Cuervo to the autoconf macro repository. We are also +# grateful for the helpful feedback of numerous users. +# +# LAST MODIFICATION +# +# 2008-04-12 +# +# COPYLEFT +# +# Copyright (c) 2008 Steven G. Johnson +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Macro Archive. When you make and +# distribute a modified version of the Autoconf Macro, you may extend this +# special exception to the GPL to apply to your modified version as well. + +AC_DEFUN([ACX_PTHREAD], [ +AC_REQUIRE([AC_CANONICAL_HOST]) +AC_LANG_SAVE +AC_LANG_C +acx_pthread_ok=no + +# We used to check for pthread.h first, but this fails if pthread.h +# requires special compiler flags (e.g. on True64 or Sequent). +# It gets checked for in the link test anyway. + +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) + AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes) + AC_MSG_RESULT($acx_pthread_ok) + if test x"$acx_pthread_ok" = xno; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" +fi + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# Create a list of thread flags to try. Items starting with a "-" are +# C compiler flags, and other items are library names, except for "none" +# which indicates that we try without any flags at all, and "pthread-config" +# which is a program returning the flags for the Pth emulation library. + +acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" + +# The ordering *is* (sometimes) important. Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) +# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) +# -pthreads: Solaris/gcc +# -mthreads: Mingw32/gcc, Lynx/gcc +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads too; +# also defines -D_REENTRANT) +# ... -mt is also the pthreads flag for HP/aCC +# pthread: Linux, etcetera +# --thread-safe: KAI C++ +# pthread-config: use pthread-config program (for GNU Pth library) + +case "${host_cpu}-${host_os}" in + *solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (We need to link with -pthreads/-mt/ + # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather + # a function called by this macro, so we could check for that, but + # who knows whether they'll stub that too in a future libc.) So, + # we'll just look for -pthreads and -lpthread first: + + acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags" + ;; +esac + +if test x"$acx_pthread_ok" = xno; then +for flag in $acx_pthread_flags; do + + case $flag in + none) + AC_MSG_CHECKING([whether pthreads work without any flags]) + ;; + + -*) + AC_MSG_CHECKING([whether pthreads work with $flag]) + PTHREAD_CFLAGS="$flag" + ;; + + pthread-config) + AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no) + if test x"$acx_pthread_config" = xno; then continue; fi + PTHREAD_CFLAGS="`pthread-config --cflags`" + PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" + ;; + + *) + AC_MSG_CHECKING([for the pthreads library -l$flag]) + PTHREAD_LIBS="-l$flag" + ;; + esac + + save_LIBS="$LIBS" + save_CFLAGS="$CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. + AC_TRY_LINK([#include ], + [pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], + [acx_pthread_ok=yes]) + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + AC_MSG_RESULT($acx_pthread_ok) + if test "x$acx_pthread_ok" = xyes; then + break; + fi + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + +# Various other checks: +if test "x$acx_pthread_ok" = xyes; then + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. + AC_MSG_CHECKING([for joinable pthread attribute]) + attr_name=unknown + for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_TRY_LINK([#include ], [int attr=$attr; return attr;], + [attr_name=$attr; break]) + done + AC_MSG_RESULT($attr_name) + if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then + AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name, + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + fi + + AC_MSG_CHECKING([if more special flags are required for pthreads]) + flag=no + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";; + *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";; + esac + AC_MSG_RESULT(${flag}) + if test "x$flag" != xno; then + PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" + fi + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + # More AIX lossage: must compile with xlc_r or cc_r + if test x"$GCC" != xyes; then + AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC}) + else + PTHREAD_CC=$CC + fi +else + PTHREAD_CC="$CC" +fi + +AC_SUBST(PTHREAD_LIBS) +AC_SUBST(PTHREAD_CFLAGS) +AC_SUBST(PTHREAD_CC) + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$acx_pthread_ok" = xyes; then + ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1]) + : +else + acx_pthread_ok=no + $2 +fi +AC_LANG_RESTORE +])dnl ACX_PTHREAD diff --git a/volk/config/bnv_have_qt.m4 b/volk/config/bnv_have_qt.m4 new file mode 100644 index 000000000..1469bfbfd --- /dev/null +++ b/volk/config/bnv_have_qt.m4 @@ -0,0 +1,404 @@ +dnl Available from the GNU Autoconf Macro Archive at: +dnl http://www.gnu.org/software/ac-archive/htmldoc/bnv_have_qt.html +dnl +AC_DEFUN([BNV_HAVE_QT], +[ + dnl THANKS! This code includes bug fixes by: + dnl Tim McClarren. + + AC_REQUIRE([AC_PROG_CXX]) + AC_REQUIRE([AC_PATH_X]) + AC_REQUIRE([AC_PATH_XTRA]) + + AC_MSG_CHECKING(for Qt) + + AC_ARG_WITH([Qt-dir], + [ --with-Qt-dir=DIR DIR is equal to \$QTDIR if you have followed the + installation instructions of Trolltech. Header + files are in DIR/include, binary utilities are + in DIR/bin and the library is in DIR/lib]) + AC_ARG_WITH([Qt-include-dir], + [ --with-Qt-include-dir=DIR + Qt header files are in DIR]) + AC_ARG_WITH([Qt-bin-dir], + [ --with-Qt-bin-dir=DIR Qt utilities such as moc and uic are in DIR]) + AC_ARG_WITH([Qt-lib-dir], + [ --with-Qt-lib-dir=DIR The Qt library is in DIR]) + AC_ARG_WITH([Qt-lib], + [ --with-Qt-lib=LIB Use -lLIB to link with the Qt library]) + if test x"$with_Qt_dir" = x"no" || + test x"$with_Qt_include-dir" = x"no" || + test x"$with_Qt_bin_dir" = x"no" || + test x"$with_Qt_lib_dir" = x"no" || + test x"$with_Qt_lib" = x"no"; then + # user disabled Qt. Leave cache alone. + have_qt="User disabled Qt." + else + # "yes" is a bogus option + if test x"$with_Qt_dir" = xyes; then + with_Qt_dir= + fi + if test x"$with_Qt_include_dir" = xyes; then + with_Qt_include_dir= + fi + if test x"$with_Qt_bin_dir" = xyes; then + with_Qt_bin_dir= + fi + if test x"$with_Qt_lib_dir" = xyes; then + with_Qt_lib_dir= + fi + if test x"$with_Qt_lib" = xyes; then + with_Qt_lib= + fi + # No Qt unless we discover otherwise + have_qt=no + # Check whether we are requested to link with a specific version + if test x"$with_Qt_lib" != x; then + bnv_qt_lib="$with_Qt_lib" + fi + # Check whether we were supplied with an answer already + if test x"$with_Qt_dir" != x; then + have_qt=yes + bnv_qt_dir="$with_Qt_dir" + bnv_qt_include_dir="$with_Qt_dir/include" + bnv_qt_bin_dir="$with_Qt_dir/bin" + bnv_qt_lib_dir="$with_Qt_dir/lib" + # Only search for the lib if the user did not define one already + if test x"$bnv_qt_lib" = x; then + bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p | + sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`" + fi + bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + else + # Use cached value or do search, starting with suggestions from + # the command line + AC_CACHE_VAL(bnv_cv_have_qt, + [ + # We are not given a solution and there is no cached value. + bnv_qt_dir=NO + bnv_qt_include_dir=NO + bnv_qt_lib_dir=NO + if test x"$bnv_qt_lib" = x; then + bnv_qt_lib=NO + fi + BNV_PATH_QT_DIRECT + if test "$bnv_qt_dir" = NO || + test "$bnv_qt_include_dir" = NO || + test "$bnv_qt_lib_dir" = NO || + test "$bnv_qt_lib" = NO; then + # Problem with finding complete Qt. Cache the known absence of Qt. + bnv_cv_have_qt="have_qt=no" + else + # Record where we found Qt for the cache. + bnv_cv_have_qt="have_qt=yes \ + bnv_qt_dir=$bnv_qt_dir \ + bnv_qt_include_dir=$bnv_qt_include_dir \ + bnv_qt_bin_dir=$bnv_qt_bin_dir \ + bnv_qt_LIBS=\"$bnv_qt_LIBS\"" + fi + ])dnl + eval "$bnv_cv_have_qt" + fi # all $bnv_qt_* are set + fi # $have_qt reflects the system status + if test x"$have_qt" = xyes; then + QT_CXXFLAGS="-I$bnv_qt_include_dir" + QT_DIR="$bnv_qt_dir" + QT_LIBS="$bnv_qt_LIBS" + # If bnv_qt_dir is defined, utilities are expected to be in the + # bin subdirectory + if test x"$bnv_qt_dir" != x; then + if test -x "$bnv_qt_dir/bin/uic"; then + QT_UIC="$bnv_qt_dir/bin/uic" + else + # Old versions of Qt don't have uic + QT_UIC= + fi + QT_MOC="$bnv_qt_dir/bin/moc" + else + # Or maybe we are told where to look for the utilities + if test x"$bnv_qt_bin_dir" != x; then + if test -x "$bnv_qt_bin_dir/uic"; then + QT_UIC="$bnv_qt_bin_dir/uic" + else + # Old versions of Qt don't have uic + QT_UIC= + fi + QT_MOC="$bnv_qt_bin_dir/moc" + else + # Last possibility is that they are in $PATH + QT_UIC="`which uic`" + QT_MOC="`which moc`" + fi + fi + # All variables are defined, report the result + AC_MSG_RESULT([$have_qt: + QT_CXXFLAGS=$QT_CXXFLAGS + QT_DIR=$QT_DIR + QT_LIBS=$QT_LIBS + QT_UIC=$QT_UIC + QT_MOC=$QT_MOC]) + else + # Qt was not found + QT_CXXFLAGS= + QT_DIR= + QT_LIBS= + QT_UIC= + QT_MOC= + AC_MSG_RESULT($have_qt) + fi + AC_SUBST(QT_CXXFLAGS) + AC_SUBST(QT_DIR) + AC_SUBST(QT_LIBS) + AC_SUBST(QT_UIC) + AC_SUBST(QT_MOC) + + #### Being paranoid: + if test x"$have_qt" = xyes; then + AC_MSG_CHECKING(correct functioning of Qt installation) + AC_CACHE_VAL(bnv_cv_qt_test_result, + [ + cat > bnv_qt_test.h << EOF +#include +class Test : public QObject +{ +Q_OBJECT +public: + Test() {} + ~Test() {} +public slots: + void receive() {} +signals: + void send(); +}; +EOF + + cat > bnv_qt_main.$ac_ext << EOF +#include "bnv_qt_test.h" +#include +int main( int argc, char **argv ) +{ + QApplication app( argc, argv ); + Test t; + QObject::connect( &t, SIGNAL(send()), &t, SLOT(receive()) ); +} +EOF + + bnv_cv_qt_test_result="failure" + bnv_try_1="$QT_MOC bnv_qt_test.h -o moc_bnv_qt_test.$ac_ext >/dev/null 2>bnv_qt_test_1.out" + AC_TRY_EVAL(bnv_try_1) + bnv_err_1=`grep -v '^ *+' bnv_qt_test_1.out | grep -v "^bnv_qt_test.h\$"` + if test x"$bnv_err_1" != x; then + echo "$bnv_err_1" >&AC_FD_CC + echo "configure: could not run $QT_MOC on:" >&AC_FD_CC + cat bnv_qt_test.h >&AC_FD_CC + else + bnv_try_2="$CXX $QT_CXXFLAGS -c $CXXFLAGS -o moc_bnv_qt_test.o moc_bnv_qt_test.$ac_ext >/dev/null 2>bnv_qt_test_2.out" + AC_TRY_EVAL(bnv_try_2) + bnv_err_2=`grep -v '^ *+' bnv_qt_test_2.out | grep -v "^bnv_qt_test.{$ac_ext}\$"` + if test x"$bnv_err_2" != x; then + echo "$bnv_err_2" >&AC_FD_CC + echo "configure: could not compile:" >&AC_FD_CC + cat bnv_qt_test.$ac_ext >&AC_FD_CC + else + bnv_try_3="$CXX $QT_CXXFLAGS -c $CXXFLAGS -o bnv_qt_main.o bnv_qt_main.$ac_ext >/dev/null 2>bnv_qt_test_3.out" + AC_TRY_EVAL(bnv_try_3) + bnv_err_3=`grep -v '^ *+' bnv_qt_test_3.out | grep -v "^bnv_qt_main.{$ac_ext}\$"` + if test x"$bnv_err_3" != x; then + echo "$bnv_err_3" >&AC_FD_CC + echo "configure: could not compile:" >&AC_FD_CC + cat bnv_qt_main.$ac_ext >&AC_FD_CC + else + bnv_try_4="$CXX $QT_LIBS $LIBS -o bnv_qt_main bnv_qt_main.o moc_bnv_qt_test.o >/dev/null 2>bnv_qt_test_4.out" + AC_TRY_EVAL(bnv_try_4) + bnv_err_4=`grep -v '^ *+' bnv_qt_test_4.out` + if test x"$bnv_err_4" != x; then + echo "$bnv_err_4" >&AC_FD_CC + else + bnv_cv_qt_test_result="success" + fi + fi + fi + fi + ])dnl AC_CACHE_VAL bnv_cv_qt_test_result + AC_MSG_RESULT([$bnv_cv_qt_test_result]); + if test x"$bnv_cv_qt_test_result" = "xfailure"; then + # working Qt was not found + QT_CXXFLAGS= + QT_DIR= + QT_LIBS= + QT_UIC= + QT_MOC= + have_qt=no + AC_MSG_WARN([Failed to find matching components of a complete + Qt installation. Try using more options, + see ./configure --help.]) + fi + + rm -f bnv_qt_test.h moc_bnv_qt_test.$ac_ext moc_bnv_qt_test.o \ + bnv_qt_main.$ac_ext bnv_qt_main.o bnv_qt_main \ + bnv_qt_test_1.out bnv_qt_test_2.out bnv_qt_test_3.out bnv_qt_test_4.out + fi +]) + +dnl Internal subroutine of BNV_HAVE_QT +dnl Set bnv_qt_dir bnv_qt_include_dir bnv_qt_bin_dir bnv_qt_lib_dir bnv_qt_lib +dnl Copyright 2001 Bastiaan N. Veelo +AC_DEFUN([BNV_PATH_QT_DIRECT], +[ + ## Binary utilities ## + if test x"$with_Qt_bin_dir" != x; then + bnv_qt_bin_dir=$with_Qt_bin_dir + fi + ## Look for header files ## + if test x"$with_Qt_include_dir" != x; then + bnv_qt_include_dir="$with_Qt_include_dir" + else + # The following header file is expected to define QT_VERSION. + qt_direct_test_header=qglobal.h + # Look for the header file in a standard set of common directories. + bnv_include_path_list=" + /usr/include + `ls -dr /usr/include/qt* 2>/dev/null` + `ls -dr /usr/lib/qt*/include 2>/dev/null` + `ls -dr /usr/local/qt*/include 2>/dev/null` + `ls -dr /opt/qt*/include 2>/dev/null` + " + for bnv_dir in $bnv_include_path_list; do + if test -r "$bnv_dir/$qt_direct_test_header"; then + bnv_dirs="$bnv_dirs $bnv_dir" + fi + done + # Now look for the newest in this list + bnv_prev_ver=0 + for bnv_dir in $bnv_dirs; do + bnv_this_ver=`egrep -w '#define QT_VERSION' $bnv_dir/$qt_direct_test_header | sed s/'#define QT_VERSION'//` + if expr $bnv_this_ver '>' $bnv_prev_ver > /dev/null; then + bnv_qt_include_dir=$bnv_dir + bnv_prev_ver=$bnv_this_ver + fi + done + fi dnl Found header files. + + # Are these headers located in a traditional Trolltech installation? + # That would be $bnv_qt_include_dir stripped from its last element: + bnv_possible_qt_dir=`dirname $bnv_qt_include_dir` + if test -x $bnv_possible_qt_dir/bin/moc && + ls $bnv_possible_qt_dir/lib/libqt* > /dev/null; then + # Then the rest is a piece of cake + bnv_qt_dir=$bnv_possible_qt_dir + bnv_qt_bin_dir="$bnv_qt_dir/bin" + bnv_qt_lib_dir="$bnv_qt_dir/lib" + # Only look for lib if the user did not supply it already + if test x"$bnv_qt_lib" = xNO; then + bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p | + sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`" + fi + bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + else + # There is no valid definition for $QTDIR as Trolltech likes to see it + bnv_qt_dir= + ## Look for Qt library ## + if test x"$with_Qt_lib_dir" != x; then + bnv_qt_lib_dir="$with_Qt_lib_dir" + # Only look for lib if the user did not supply it already + if test x"$bnv_qt_lib" = xNO; then + bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p | + sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`" + fi + bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + else + # Normally, when there is no traditional Trolltech installation, + # the library is installed in a place where the linker finds it + # automatically. + # If the user did not define the library name, try with qt + if test x"$bnv_qt_lib" = xNO; then + bnv_qt_lib=qt + fi + qt_direct_test_header=qapplication.h + qt_direct_test_main=" + int argc; + char ** argv; + QApplication app(argc,argv); + " + # See if we find the library without any special options. + # Don't add top $LIBS permanently yet + bnv_save_LIBS="$LIBS" + LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + bnv_qt_LIBS="$LIBS" + bnv_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="-I$bnv_qt_include_dir" + AC_TRY_LINK([#include <$qt_direct_test_header>], + $qt_direct_test_main, + [ + # Success. + # We can link with no special library directory. + bnv_qt_lib_dir= + ], [ + # That did not work. Try the multi-threaded version + echo "Non-critical error, please neglect the above." >&AC_FD_CC + bnv_qt_lib=qt-mt + LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + AC_TRY_LINK([#include <$qt_direct_test_header>], + $qt_direct_test_main, + [ + # Success. + # We can link with no special library directory. + bnv_qt_lib_dir= + ], [ + # That did not work. Try the OpenGL version + echo "Non-critical error, please neglect the above." >&AC_FD_CC + bnv_qt_lib=qt-gl + LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + AC_TRY_LINK([#include <$qt_direct_test_header>], + $qt_direct_test_main, + [ + # Succes. + # We can link with no special library directory. + bnv_qt_lib_dir= + ], [ + # That did not work. Maybe a library version I don't know about? + echo "Non-critical error, please neglect the above." >&AC_FD_CC + # Look for some Qt lib in a standard set of common directories. + bnv_dir_list=" + `echo $bnv_qt_includes | sed ss/includess` + /lib + /usr/lib + /usr/local/lib + /opt/lib + `ls -dr /usr/lib/qt* 2>/dev/null` + `ls -dr /usr/local/qt* 2>/dev/null` + `ls -dr /opt/qt* 2>/dev/null` + " + for bnv_dir in $bnv_dir_list; do + if ls $bnv_dir/libqt*; then + # Gamble that it's the first one... + bnv_qt_lib="`ls $bnv_dir/libqt* | sed -n 1p | + sed s@$bnv_dir/lib@@ | sed s/[.].*//`" + bnv_qt_lib_dir="$bnv_dir" + break + fi + done + # Try with that one + LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS" + AC_TRY_LINK([#include <$qt_direct_test_header>], + $qt_direct_test_main, + [ + # Succes. + # We can link with no special library directory. + bnv_qt_lib_dir= + ], [ + # Leave bnv_qt_lib_dir defined + ]) + ]) + ]) + ]) + if test x"$bnv_qt_lib_dir" != x; then + bnv_qt_LIBS="-l$bnv_qt_lib_dir $LIBS" + else + bnv_qt_LIBS="$LIBS" + fi + LIBS="$bnv_save_LIBS" + CXXFLAGS="$bnv_save_CXXFLAGS" + fi dnl $with_Qt_lib_dir was not given + fi dnl Done setting up for non-traditional Trolltech installation +]) diff --git a/volk/config/cppunit.m4 b/volk/config/cppunit.m4 new file mode 100644 index 000000000..0991d51ec --- /dev/null +++ b/volk/config/cppunit.m4 @@ -0,0 +1,80 @@ +dnl +dnl AM_PATH_CPPUNIT(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]]) +dnl +AC_DEFUN([AM_PATH_CPPUNIT], +[ + +AC_ARG_WITH(cppunit-prefix,[ --with-cppunit-prefix=PFX Prefix where CppUnit is installed (optional)], + cppunit_config_prefix="$withval", cppunit_config_prefix="") +AC_ARG_WITH(cppunit-exec-prefix,[ --with-cppunit-exec-prefix=PFX Exec prefix where CppUnit is installed (optional)], + cppunit_config_exec_prefix="$withval", cppunit_config_exec_prefix="") + + if test x$cppunit_config_exec_prefix != x ; then + cppunit_config_args="$cppunit_config_args --exec-prefix=$cppunit_config_exec_prefix" + if test x${CPPUNIT_CONFIG+set} != xset ; then + CPPUNIT_CONFIG=$cppunit_config_exec_prefix/bin/cppunit-config + fi + fi + if test x$cppunit_config_prefix != x ; then + cppunit_config_args="$cppunit_config_args --prefix=$cppunit_config_prefix" + if test x${CPPUNIT_CONFIG+set} != xset ; then + CPPUNIT_CONFIG=$cppunit_config_prefix/bin/cppunit-config + fi + fi + + AC_PATH_PROG(CPPUNIT_CONFIG, cppunit-config, no) + cppunit_version_min=$1 + + AC_MSG_CHECKING(for Cppunit - version >= $cppunit_version_min) + no_cppunit="" + if test "$CPPUNIT_CONFIG" = "no" ; then + no_cppunit=yes + else + CPPUNIT_CFLAGS=`$CPPUNIT_CONFIG --cflags` + CPPUNIT_LIBS=`$CPPUNIT_CONFIG --libs` + cppunit_version=`$CPPUNIT_CONFIG --version` + + cppunit_major_version=`echo $cppunit_version | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\1/'` + cppunit_minor_version=`echo $cppunit_version | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\2/'` + cppunit_micro_version=`echo $cppunit_version | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\3/'` + + cppunit_major_min=`echo $cppunit_version_min | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\1/'` + cppunit_minor_min=`echo $cppunit_version_min | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\2/'` + cppunit_micro_min=`echo $cppunit_version_min | \ + sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\3/'` + + cppunit_version_proper=`expr \ + $cppunit_major_version \> $cppunit_major_min \| \ + $cppunit_major_version \= $cppunit_major_min \& \ + $cppunit_minor_version \> $cppunit_minor_min \| \ + $cppunit_major_version \= $cppunit_major_min \& \ + $cppunit_minor_version \= $cppunit_minor_min \& \ + $cppunit_micro_version \>= $cppunit_micro_min ` + + if test "$cppunit_version_proper" = "1" ; then + AC_MSG_RESULT([$cppunit_major_version.$cppunit_minor_version.$cppunit_micro_version]) + else + AC_MSG_RESULT(no) + no_cppunit=yes + fi + fi + + if test "x$no_cppunit" = x ; then + ifelse([$2], , :, [$2]) + else + CPPUNIT_CFLAGS="" + CPPUNIT_LIBS="" + ifelse([$3], , :, [$3]) + fi + + AC_SUBST(CPPUNIT_CFLAGS) + AC_SUBST(CPPUNIT_LIBS) +]) + + + diff --git a/volk/config/gcc_version_workaround.m4 b/volk/config/gcc_version_workaround.m4 new file mode 100644 index 000000000..b3ba0b6f3 --- /dev/null +++ b/volk/config/gcc_version_workaround.m4 @@ -0,0 +1,49 @@ +AC_DEFUN([LV_GCC_VERSION_WORKAROUND], +[ + AC_REQUIRE([LF_CONFIGURE_CXX]) + + cxx_version=`$CXX --version` + + cxx_major_version=`echo $cxx_version | sed 's/[[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\1/'` + cxx_minor_version=`echo $cxx_version | sed 's/g++ [[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\2/'` + cxx_micro_version=`echo $cxx_version | sed 's/g++ [[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\3/'` + + if test ["$cxx_minor_version" -lt "3"] -o ["$cxx_major_version" -lt "4"]; then + cxx_proper_version="$cxx_major_version.$cxx_minor_version.$cxx_micro_version" + + + my_arch=`uname -m` + + if test "${my_arch}" = i686; then + my_arch="${my_arch} i586 i486 i386" + fi + if test "${my_arch}" = i586; then + my_arch = "${my_arch} i686 i386 i486" + fi + if test "${my_arch}" = i486; then + my_arch = "${my_arch} i686 i386 i586" + fi + if test "${my_arch}" = i386; then + my_arch = "${my_arch} i686 i586 i486" + fi + + for i in $my_arch + do + if test -n "`ls /usr/include/c++/$cxx_proper_version | grep $i`"; then + mystery_dir=`ls /usr/include/c++/$cxx_proper_version | grep $i` + echo `ls /usr/include/c++/$cxx_proper_version | grep $i` + fi + done + echo "${mystery_dir}" + + LV_CXXFLAGS="${LV_CXXFLAGS} -nostdinc++ -I/usr/include/c++/$cxx_proper_version -I/usr/include/c++/$cxx_proper_version/$mystery_dir" + + CXXFLAGS="${CXXFLAGS} -nostdinc++ -I/usr/include/c++/$cxx_proper_version -I/usr/include/c++/$cxx_proper_version/$mystery_dir" + + + + fi + + + + ]) \ No newline at end of file diff --git a/volk/config/gr_lib64.m4 b/volk/config/gr_lib64.m4 new file mode 100644 index 000000000..751f774b4 --- /dev/null +++ b/volk/config/gr_lib64.m4 @@ -0,0 +1,85 @@ +dnl +dnl Copyright 2005,2008 Free Software Foundation, Inc. +dnl +dnl This file is part of GNU Radio +dnl +dnl GNU Radio is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3, or (at your option) +dnl any later version. +dnl +dnl GNU Radio is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with GNU Radio; see the file COPYING. If not, write to +dnl the Free Software Foundation, Inc., 51 Franklin Street, +dnl Boston, MA 02110-1301, USA. +dnl + +dnl GR_LIB64() +dnl +dnl Checks to see if we're on a x86_64 or powerpc64 machine, and if so, determine +dnl if libdir should end in "64" or not. +dnl +dnl Sets gr_libdir_suffix to "" or "64" and calls AC_SUBST(gr_libdir_suffix) +dnl May append "64" to libdir. +dnl +dnl The current heuristic is: +dnl if the host_cpu isn't x86_64 or powerpc64, then "" +dnl if the host_os isn't linux, then "" +dnl if we're cross-compiling, ask the linker, by way of the selected compiler +dnl if we're x86_64 and there's a /lib64 and it's not a symlink, then "64", else "" +dnl else ask the compiler +dnl +AC_DEFUN([GR_LIB64],[ + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([AC_PROG_CXX]) + + AC_MSG_CHECKING([gr_libdir_suffix]) + gr_libdir_suffix="" + AC_SUBST(gr_libdir_suffix) + + case "$host_os" in + linux*) is_linux=yes ;; + *) is_linux=no ;; + esac + + if test "$is_linux" = no || test "$host_cpu" != "x86_64" && test "$host_cpu" != "powerpc64"; then + gr_libdir_suffix="" + elif test "$cross_compiling" = yes; then + _GR_LIB64_ASK_COMPILER + elif test "$host_cpu" = "x86_64"; then + if test -d /lib64 && test ! -L /lib64; then + gr_libdir_suffix=64 + fi + else + _GR_LIB64_ASK_COMPILER + fi + AC_MSG_RESULT([$gr_libdir_suffix]) + + + AC_MSG_CHECKING([whether to append 64 to libdir]) + t=${libdir##*/lib} + if test "$t" != 64 && test "$gr_libdir_suffix" = "64"; then + libdir=${libdir}64 + AC_MSG_RESULT([yes. Setting libdir to $libdir]) + else + AC_MSG_RESULT([no]) + fi +]) + +dnl If we're using g++, extract the first SEARCH_DIR("...") entry from the linker script +dnl and see if it contains a suffix after the final .../lib part of the path. +dnl (This works because the linker script varies depending on whether we're generating +dnl 32-bit or 64-bit executables) +dnl +AC_DEFUN([_GR_LIB64_ASK_COMPILER],[ + if test "$ac_cv_cxx_compiler_gnu" = "yes"; + then + gr_libdir_suffix=`$CXX -Wl,--verbose 2>/dev/null | sed -n -e '/SEARCH_DIR/{s/;.*$//; s,^.*/,,; s/".*$//; s/^lib//; p}'` + fi +]) + diff --git a/volk/config/gr_libgnuradio_core_extra_ldflags.m4 b/volk/config/gr_libgnuradio_core_extra_ldflags.m4 new file mode 100644 index 000000000..43f872c04 --- /dev/null +++ b/volk/config/gr_libgnuradio_core_extra_ldflags.m4 @@ -0,0 +1,40 @@ +# Check for (MinGW)win32 extra ld options. -*- Autoconf -*- + +# Copyright 2003,2004,2005 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. + +dnl +AC_DEFUN([GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS], [ +AC_REQUIRE([AC_PROG_LD]) +# on Mingw32 extra LDFLAGS are required to ease global variable linking +LIBGNURADIO_CORE_EXTRA_LDFLAGS="" + +AC_MSG_CHECKING([whether $LD accepts --enable-runtime-pseudo-reloc]) +if ${LD} --enable-runtime-pseudo-reloc --version >/dev/null 2>&1 +then + # libtool requires the quotes + LIBGNURADIO_CORE_EXTRA_LDFLAGS="\"-Wl,--enable-runtime-pseudo-reloc\"" + AC_MSG_RESULT(yes) +else + AC_MSG_RESULT(no) +fi + +AC_SUBST(LIBGNURADIO_CORE_EXTRA_LDFLAGS) + +]) diff --git a/volk/config/gr_no_undefined.m4 b/volk/config/gr_no_undefined.m4 new file mode 100644 index 000000000..c8d745d5f --- /dev/null +++ b/volk/config/gr_no_undefined.m4 @@ -0,0 +1,44 @@ +dnl +dnl Copyright 2005 Free Software Foundation, Inc. +dnl +dnl This file is part of GNU Radio +dnl +dnl GNU Radio is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3, or (at your option) +dnl any later version. +dnl +dnl GNU Radio is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with GNU Radio; see the file COPYING. If not, write to +dnl the Free Software Foundation, Inc., 51 Franklin Street, +dnl Boston, MA 02110-1301, USA. +dnl + +# GR_NO_UNDEFINED() +# +# Detemine whether we need to use the -no-undefined linker flag +# when building shared libraries. +# Sets NO_UNDEFINED to "" or "-no-undefined" +# +# As far as I can tell, we need -no-undefined only when building +# windows DLLs. This occurs when using MinGW and Cygwin. +# +# For now, we stub this out. + +AC_DEFUN([GR_NO_UNDEFINED],[ + AC_REQUIRE([AC_CANONICAL_HOST]) + no_undefined="" + case "${host_os}" in + *mingw* | *cygwin*) + + # on MinGW/Cygwin extra LDFLAGS are required + no_undefined="-no-undefined" + ;; + esac + AC_SUBST(NO_UNDEFINED,[$no_undefined]) +]) diff --git a/volk/config/gr_omnithread.m4 b/volk/config/gr_omnithread.m4 new file mode 100644 index 000000000..054f07824 --- /dev/null +++ b/volk/config/gr_omnithread.m4 @@ -0,0 +1,52 @@ +# Check for Omnithread (pthread/NT) thread support. -*- Autoconf -*- + +# Copyright 2003,2007 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Boston, MA +# 02110-1301, USA. + +AC_DEFUN([GR_OMNITHREAD], +[ + # Check first for POSIX + ACX_PTHREAD( + [ AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]) + ot_posix="yes" + DEFINES="$DEFINES -DOMNITHREAD_POSIX=1" + ],[ + # If no POSIX support found, then check for NT threads + AC_MSG_CHECKING([for NT threads]) + + AC_LINK_IFELSE([ + #include + #include + int main() { InitializeCriticalSection(NULL); return 0; } + ], + [ + ot_nt="yes" + DEFINES="$DEFINES -DOMNITHREAD_NT=1" + ], + [AC_MSG_FAILURE([GNU Radio requires POSIX threads. pthreads not found.])] + ) + AC_MSG_RESULT(yes) + ]) + AM_CONDITIONAL(OMNITHREAD_POSIX, test "x$ot_posix" = xyes) + AM_CONDITIONAL(OMNITHREAD_NT, test "x$ot_nt" = xyes) + + save_LIBS="$LIBS" + AC_SEARCH_LIBS([clock_gettime], [rt], [PTHREAD_LIBS="$PTHREAD_LIBS $LIBS"]) + AC_CHECK_FUNCS([clock_gettime gettimeofday nanosleep]) + LIBS="$save_LIBS" +]) + diff --git a/volk/config/gr_pwin32.m4 b/volk/config/gr_pwin32.m4 new file mode 100644 index 000000000..7b99cba6b --- /dev/null +++ b/volk/config/gr_pwin32.m4 @@ -0,0 +1,146 @@ +# Check for (mingw)win32 POSIX replacements. -*- Autoconf -*- + +# Copyright 2003,2004,2005 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. + + +AC_DEFUN([GR_PWIN32], +[ +AC_REQUIRE([AC_HEADER_TIME]) +AC_CHECK_HEADERS([sys/types.h fcntl.h io.h]) +AC_CHECK_HEADERS([windows.h]) +AC_CHECK_HEADERS([winioctl.h winbase.h], [], [], [ + #if HAVE_WINDOWS_H + #include + #endif +]) + +AC_CHECK_FUNCS([getopt usleep gettimeofday nanosleep rand srand random srandom sleep sigaction]) +AC_CHECK_TYPES([struct timezone, struct timespec, ssize_t],[],[],[ + #if HAVE_SYS_TYPES_H + # include + #endif + #if TIME_WITH_SYS_TIME + # include + # include + #else + # if HAVE_SYS_TIME_H + # include + # else + # include + # endif + #endif +]) + +dnl Checks for replacements +AC_REPLACE_FUNCS([getopt usleep gettimeofday]) + + +AC_MSG_CHECKING(for Sleep) +AC_TRY_LINK([ #include + #include + ], [ Sleep(0); ], + [AC_DEFINE(HAVE_SSLEEP,1,[Define to 1 if you have win32 Sleep]) + AC_MSG_RESULT(yes)], + AC_MSG_RESULT(no) + ) + +dnl Under Win32, mkdir prototype in io.h has only one arg +AC_MSG_CHECKING(whether mkdir accepts only one arg) +AC_TRY_COMPILE([#include + #include + #include ], [ + mkdir("") + ], [ AC_MSG_RESULT(yes) + AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ], + [ AC_MSG_RESULT(no) + ]) + +AH_BOTTOM( +[ +/* Define missing prototypes, implemented in replacement lib */ +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef HAVE_GETOPT +int getopt (int argc, char * const argv[], const char * optstring); +extern char * optarg; +extern int optind, opterr, optopt; +#endif + +#ifndef HAVE_USLEEP +int usleep(unsigned long usec); /* SUSv2 */ +#endif + +#ifndef HAVE_NANOSLEEP +#ifndef HAVE_STRUCT_TIMESPEC +#if HAVE_SYS_TYPES_H +# include /* need time_t */ +#endif +struct timespec { + time_t tv_sec; + long tv_nsec; +}; +#endif +static inline int nanosleep(const struct timespec *req, struct timespec *rem) { return usleep(req->tv_sec*1000000+req->tv_nsec/1000); } +#endif + +#if defined(HAVE_SSLEEP) && !defined(HAVE_SLEEP) +#ifdef HAVE_WINBASE_H +#include +#include +#endif +/* TODO: what about SleepEx? */ +static inline unsigned int sleep (unsigned int nb_sec) { Sleep(nb_sec*1000); return 0; } +#endif + +#ifndef HAVE_GETTIMEOFDAY +#ifdef HAVE_SYS_TIME_H +#include +#endif +#ifndef HAVE_STRUCT_TIMEZONE +struct timezone { + int tz_minuteswest; + int tz_dsttime; +}; +#endif +int gettimeofday(struct timeval *tv, struct timezone *tz); +#endif + +#if !defined(HAVE_RANDOM) && defined(HAVE_RAND) +#include +static inline long int random (void) { return rand(); } +#endif + +#if !defined(HAVE_SRANDOM) && defined(HAVE_SRAND) +static inline void srandom (unsigned int seed) { srand(seed); } +#endif + +#ifndef HAVE_SSIZE_T +typedef size_t ssize_t; +#endif + +#ifdef __cplusplus +} +#endif +]) + + +]) diff --git a/volk/config/gr_set_md_cpu.m4 b/volk/config/gr_set_md_cpu.m4 new file mode 100644 index 000000000..56fd83bac --- /dev/null +++ b/volk/config/gr_set_md_cpu.m4 @@ -0,0 +1,63 @@ +dnl +dnl Copyright 2003,2008 Free Software Foundation, Inc. +dnl +dnl This file is part of GNU Radio +dnl +dnl GNU Radio is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3, or (at your option) +dnl any later version. +dnl +dnl GNU Radio is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with GNU Radio; see the file COPYING. If not, write to +dnl the Free Software Foundation, Inc., 51 Franklin Street, +dnl Boston, MA 02110-1301, USA. +dnl + +AC_DEFUN([_TRY_ADD_ALTIVEC], +[ + LF_CHECK_CC_FLAG([-mabi=altivec -maltivec]) + LF_CHECK_CXX_FLAG([-mabi=altivec -maltivec]) +]) + +AC_DEFUN([GR_SET_MD_CPU],[ + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_ARG_WITH(md-cpu, + AC_HELP_STRING([--with-md-cpu=ARCH],[set machine dependent speedups (auto)]), + [cf_with_md_cpu="$withval"], + [cf_with_md_cpu="$host_cpu"]) + + case "$cf_with_md_cpu" in + x86 | i[[3-7]]86) MD_CPU=x86 MD_SUBCPU=x86 ;; + x86_64) MD_CPU=x86 MD_SUBCPU=x86_64 ;; + powerpc*) MD_CPU=powerpc ;; + *) MD_CPU=generic ;; + esac + + AC_ARG_ENABLE(altivec, + AC_HELP_STRING([--enable-altivec],[enable altivec on PowerPC (yes)]), + [ if test $MD_CPU = powerpc; then + case "$enableval" in + (no) MD_CPU=generic ;; + (yes) _TRY_ADD_ALTIVEC ;; + (*) AC_MSG_ERROR([Invalid argument ($enableval) to --enable-altivec]) ;; + esac + fi], + [ if test $MD_CPU = powerpc; then _TRY_ADD_ALTIVEC fi]) + + + AC_MSG_CHECKING([for machine dependent speedups]) + AC_MSG_RESULT($MD_CPU) + AC_SUBST(MD_CPU) + AC_SUBST(MD_SUBCPU) + + AM_CONDITIONAL(MD_CPU_x86, test "$MD_CPU" = "x86") + AM_CONDITIONAL(MD_SUBCPU_x86_64, test "$MD_SUBCPU" = "x86_64") + AM_CONDITIONAL(MD_CPU_powerpc, test "$MD_CPU" = "powerpc") + AM_CONDITIONAL(MD_CPU_generic, test "$MD_CPU" = "generic") +]) diff --git a/volk/config/gr_sysv_shm.m4 b/volk/config/gr_sysv_shm.m4 new file mode 100644 index 000000000..db5c8351e --- /dev/null +++ b/volk/config/gr_sysv_shm.m4 @@ -0,0 +1,36 @@ +# Check for IPC System V shm support. -*- Autoconf -*- + +# Copyright 2003 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Boston, MA +# 02110-1301, USA. + +AC_DEFUN([GR_SYSV_SHM], +[ + AC_LANG_SAVE + AC_LANG_C + + AC_CHECK_HEADERS([sys/ipc.h sys/shm.h]) + + save_LIBS="$LIBS" + AC_SEARCH_LIBS(shmat, [cygipc ipc], + [ IPC_LIBS="$LIBS" ], + [ AC_MSG_WARN([SystemV IPC support not found. ]) ] + ) + LIBS="$save_LIBS" + + AC_LANG_RESTORE + AC_SUBST(IPC_LIBS) +]) diff --git a/volk/config/lf_cc.m4 b/volk/config/lf_cc.m4 new file mode 100644 index 000000000..b75e1a4c5 --- /dev/null +++ b/volk/config/lf_cc.m4 @@ -0,0 +1,41 @@ +dnl Autoconf support for C++ +dnl Copyright (C) 1988 Eleftherios Gkioulekas +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA. +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a configuration +dnl script generated by Autoconf, you may include it under the same +dnl distribution terms that you use for the rest of that program. + +# ------------------------------------------------------------------------- +# Use this macro to configure your C compiler +# When called the macro does the following things: +# 1. It finds an appropriate C compiler. +# If you passed the flag --with-cc=foo then it uses that +# particular compiler +# 2. Check whether the compiler works. +# 3. Checks whether the compiler accepts the -g +# ------------------------------------------------------------------------- + +AC_DEFUN([LF_CONFIGURE_CC],[ + dnl Sing the song + AC_REQUIRE([AC_PROG_CC])dnl + AC_REQUIRE([AC_PROG_CPP])dnl + AC_REQUIRE([AC_AIX])dnl + AC_REQUIRE([AC_ISC_POSIX])dnl + AC_REQUIRE([AC_HEADER_STDC])dnl +]) + diff --git a/volk/config/lf_cxx.m4 b/volk/config/lf_cxx.m4 new file mode 100644 index 000000000..dfc6bfbfe --- /dev/null +++ b/volk/config/lf_cxx.m4 @@ -0,0 +1,67 @@ +dnl Autoconf support for C++ +dnl Copyright (C) 1988 Eleftherios Gkioulekas +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA. +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a configuration +dnl script generated by Autoconf, you may include it under the same +dnl distribution terms that you use for the rest of that program. + +# ----------------------------------------------------------------- +# This macro should be called to configure your C++ compiler. +# When called, the macro does the following things: +# 1. It finds an appropriate C++ compiler +# If you passed the flag --with-cxx=foo, then it uses that +# particular compiler +# 2. Checks whether the compiler accepts the -g +# ------------------------------------------------------------------ + +AC_DEFUN([LF_CONFIGURE_CXX],[ + AC_REQUIRE([AC_PROG_CXX])dnl + AC_REQUIRE([AC_PROG_CXXCPP])dnl + LF_CXX_PORTABILITY +]) + +# ----------------------------------------------------------------------- +# This macro tests the C++ compiler for various portability problem. +# ----------------------------------------------------------------------- + + +AC_DEFUN([LF_CXX_PORTABILITY],[ + + dnl + dnl Check for common C++ portability problems + dnl + + dnl AC_LANG_PUSH + dnl AC_LANG_CPLUSPLUS + AC_LANG_SAVE + AC_LANG_CPLUSPLUS + + + dnl Test whether C++ has std::isnan + AC_MSG_CHECKING(whether C++ has std::isnan) + AC_TRY_COMPILE([#include ], [ + std::isnan(0); +], [ AC_MSG_RESULT(yes) + AC_DEFINE(CXX_HAS_STD_ISNAN,[],[Define if has std::isnan]) ], + [ AC_MSG_RESULT(no) ]) + + dnl Done with the portability checks + dnl AC_LANG_POP([C++]) + AC_LANG_RESTORE +]) + diff --git a/volk/config/lf_warnings.m4 b/volk/config/lf_warnings.m4 new file mode 100644 index 000000000..d40c77f14 --- /dev/null +++ b/volk/config/lf_warnings.m4 @@ -0,0 +1,121 @@ +dnl Copyright (C) 1988 Eleftherios Gkioulekas +dnl Copyright (C) 2009 Free Software Foundation, Inc. +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA. +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a configuration +dnl script generated by Autoconf, you may include it under the same +dnl distribution terms that you use for the rest of that program. + +# -------------------------------------------------------------------------- +# Check whether the C++ compiler accepts a certain flag +# If it does it adds the flag to lf_CXXFLAGS +# If it does not then it returns an error to lf_ok +# Usage: +# LF_CHECK_CXX_FLAG(-flag1 -flag2 -flag3 ...) +# ------------------------------------------------------------------------- + +AC_DEFUN([LF_CHECK_CXX_FLAG],[ + echo 'void f(){}' > conftest.cc + for i in $1 + do + AC_MSG_CHECKING([whether $CXX accepts $i]) + if test -z "`${CXX} $i -c conftest.cc 2>&1`" + then + lf_CXXFLAGS="${lf_CXXFLAGS} $i" + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no) + fi + done + rm -f conftest.cc conftest.o + AC_SUBST(lf_CXXFLAGS) +]) + +# -------------------------------------------------------------------------- +# Check whether the C compiler accepts a certain flag +# If it does it adds the flag to lf_CFLAGS +# If it does not then it returns an error to lf_ok +# Usage: +# LF_CHECK_CC_FLAG(-flag1 -flag2 -flag3 ...) +# ------------------------------------------------------------------------- + +AC_DEFUN([LF_CHECK_CC_FLAG],[ + echo 'void f(){}' > conftest.c + for i in $1 + do + AC_MSG_CHECKING([whether $CC accepts $i]) + if test -z "`${CC} $i -c conftest.c 2>&1`" + then + lf_CFLAGS="${lf_CFLAGS} $i" + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no) + fi + done + rm -f conftest.c conftest.o + AC_SUBST(lf_CFLAGS) +]) + +# -------------------------------------------------------------------------- +# Check whether the Fortran compiler accepts a certain flag +# If it does it adds the flag to lf_FFLAGS +# If it does not then it returns an error to lf_ok +# Usage: +# LF_CHECK_F77_FLAG(-flag1 -flag2 -flag3 ...) +# ------------------------------------------------------------------------- + +AC_DEFUN([LF_CHECK_F77_FLAG],[ + cat << EOF > conftest.f +c....:++++++++++++++++++++++++ + PROGRAM MAIN + PRINT*,'Hello World!' + END +EOF + for i in $1 + do + AC_MSG_CHECKING([whether $F77 accepts $i]) + if test -z "`${F77} $i -c conftest.f 2>&1`" + then + lf_FFLAGS="${lf_FFLAGS} $i" + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no) + fi + done + rm -f conftest.f conftest.o + AC_SUBST(lf_FFLAGS) +]) + +# ---------------------------------------------------------------------- +# Enable compiler warnings. +# Call this command AFTER you have configured ALL your compilers. +# ---------------------------------------------------------------------- + +AC_DEFUN([LF_SET_WARNINGS],[ + dnl Warnings for the two main compilers + dnl add -Wextra when you're got time to fix a bunch of them ;-) + cc_warning_flags="-Wall -Werror-implicit-function-declaration" + cxx_warning_flags="-Wall -Woverloaded-virtual" + if test -n "${CC}" + then + LF_CHECK_CC_FLAG($cc_warning_flags) + fi + if test -n "${CXX}" + then + LF_CHECK_CXX_FLAG($cxx_warning_flags) + fi +]) diff --git a/volk/config/lf_x11.m4 b/volk/config/lf_x11.m4 new file mode 100644 index 000000000..460cd605f --- /dev/null +++ b/volk/config/lf_x11.m4 @@ -0,0 +1,39 @@ +dnl Copyright (C) 1988 Eleftherios Gkioulekas +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA. +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a configuration +dnl script generated by Autoconf, you may include it under the same +dnl distribution terms that you use for the rest of that program. + + +#----------------------------------------------------------------------- +# This macro searches for Xlib and when it finds it it adds the +# appropriate flags to CXXFLAGS and export the link sequence to +# the variable XLIB. +# In your configure.in file add: +# LF_PATH_XLIB +# In your Makefile.am add +# program_LDADD = .... $(XLIB) +#------------------------------------------------------------------------ + +AC_DEFUN([LF_PATH_XLIB],[ + AC_PATH_XTRA + CXXFLAGS="$CXXFLAGS $X_CFLAGS" + XLIB="$X_LIBS $X_PRE_LIBS -lX11 $X_EXTRA_LIBS" + AC_SUBST(XLIB) +]) + diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 new file mode 100644 index 000000000..fc6a0a567 --- /dev/null +++ b/volk/config/lv_configure.m4 @@ -0,0 +1,110 @@ +dnl all this stuff taken and modified from GNURADIO! +dnl +dnl LV_CONFIGURE +dnl +dnl Handles the bulk of the configure.ac work for an out-of-tree build +dnl +dnl You must invoke: +dnl +dnl AC_INIT(package_name,version) +dnl AC_PREREQ(2.57) +dnl AC_CONFIG_AUX_DIR([.]) +dnl +dnl in configure.ac before LV_CONFIGURE +dnl +dnl +dnl N.B., this is an m4_define because if it were an AC_DEFUN it would +dnl get called too late to be useful. + +m4_define([LV_CONFIGURE], +[ + + AC_CONFIG_SRCDIR([config/lv_configure.m4]) + AC_CONFIG_AUX_DIR([.]) + AM_CONFIG_HEADER(config.h) + + AC_CANONICAL_BUILD + AC_CANONICAL_HOST + AC_CANONICAL_TARGET + + AM_INIT_AUTOMAKE(libvector,0.0svn) + + LF_CONFIGURE_CC + LF_CONFIGURE_CXX + GR_LIB64 dnl check for lib64 suffix after choosing compilers + + LV_GCC_VERSION_WORKAROUND + dnl add ${prefix}/lib${gr_libdir_suffix}/pkgconfig to the head of the PKG_CONFIG_PATH + if test x${PKG_CONFIG_PATH} = x; then + PKG_CONFIG_PATH=${prefix}/lib${gr_libdir_suffix}/pkgconfig + else + PKG_CONFIG_PATH=${prefix}/lib${gr_libdir_suffix}/pkgconfig:${PKG_CONFIG_PATH} + fi + export PKG_CONFIG_PATH + + LF_SET_WARNINGS +dnl GR_SET_GPROF +dnl GR_SET_PROF + AM_PROG_AS + AC_PROG_LN_S + AC_PROG_MAKE_SET + AC_PROG_INSTALL + AC_PATH_PROG([RM_PROG], [rm]) + + AC_LIBTOOL_WIN32_DLL + AC_ENABLE_SHARED dnl do build shared libraries... important for qa + AC_DISABLE_STATIC dnl don't build static libraries... important for qa + m4_ifdef([LT_INIT],[LT_INIT],[AC_PROG_LIBTOOL]) + dnl GR_FORTRAN + + GR_NO_UNDEFINED dnl do we need the -no-undefined linker flag +dnl GR_SCRIPTING dnl Locate python, SWIG, etc + +dnl AC_ARG_WITH([python], +dnl AC_HELP_STRING([--with-python], [Should we use python? [[default=yes]]]), +dnl [case "$with_python" in +dnl (no | yes) ;; +dnl (*) AC_MSG_ERROR([Invalid argument ($with_python) to --with-python]) ;; +dnl esac], +dnl [with_python=yes]) + +dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) + + + dnl Set the c++ compiler that we use for the build system when cross compiling + if test "x$CXX_FOR_BUILD" = x + then + CXX_FOR_BUILD=${CXX} + fi + AC_SUBST(CXX_FOR_BUILD) + + dnl Checks for header files. + AC_HEADER_STDC + + dnl Checks for typedefs, structures, and compiler characteristics. + AC_C_CONST + AC_C_INLINE + AC_TYPE_SIZE_T + AC_HEADER_TIME + AC_C_BIGENDIAN + + dnl Check for Mingw support + GR_PWIN32 + GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS + + LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" + + AC_CHECK_PROG([XMLTO],[xmlto],[yes],[]) + AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes]) + + dnl Define where to look for cppunit includes and libs + dnl sets CPPUNIT_CFLAGS and CPPUNIT_LIBS + dnl Try using pkg-config first, then fall back to cppunit-config. + PKG_CHECK_EXISTS(cppunit, + [PKG_CHECK_MODULES(CPPUNIT, cppunit >= 1.9.14)], + [AM_PATH_CPPUNIT([1.9.14],[], + [AC_MSG_ERROR([LIBVECTOR requires cppunit. Stop])])]) + +dnl PKG_CHECK_MODULES(GNURADIO_CORE, gnuradio-core >= 3) +dnl LIBS="$LIBS $GNURADIO_CORE_LIBS" +]) diff --git a/volk/config/mkstemp.m4 b/volk/config/mkstemp.m4 new file mode 100644 index 000000000..4af0f0a9b --- /dev/null +++ b/volk/config/mkstemp.m4 @@ -0,0 +1,89 @@ +#serial 4 + +# On some hosts (e.g., HP-UX 10.20, SunOS 4.1.4, Solaris 2.5.1), mkstemp has a +# silly limit that it can create no more than 26 files from a given template. +# Other systems lack mkstemp altogether. +# On OSF1/Tru64 V4.0F, the system-provided mkstemp function can create +# only 32 files per process. +# On systems like the above, arrange to use the replacement function. +AC_DEFUN([UTILS_FUNC_MKSTEMP], +[dnl + AC_REPLACE_FUNCS(mkstemp) + if test $ac_cv_func_mkstemp = no; then + utils_cv_func_mkstemp_limitations=yes + else + AC_CACHE_CHECK([for mkstemp limitations], + utils_cv_func_mkstemp_limitations, + [ + AC_TRY_RUN([ +# include + int main () + { + int i; + for (i = 0; i < 70; i++) + { + char template[] = "conftestXXXXXX"; + int fd = mkstemp (template); + if (fd == -1) + exit (1); + close (fd); + } + exit (0); + } + ], + utils_cv_func_mkstemp_limitations=no, + utils_cv_func_mkstemp_limitations=yes, + utils_cv_func_mkstemp_limitations=yes + ) + ] + ) + fi + + if test $utils_cv_func_mkstemp_limitations = yes; then + AC_LIBOBJ(mkstemp) + AC_LIBOBJ(tempname) + AC_DEFINE(mkstemp, rpl_mkstemp, + [Define to rpl_mkstemp if the replacement function should be used.]) + gl_PREREQ_MKSTEMP + jm_PREREQ_TEMPNAME + fi +]) + +# Prerequisites of lib/mkstemp.c. +AC_DEFUN([gl_PREREQ_MKSTEMP], +[ + AH_BOTTOM( + [ + #ifndef HAVE_MKSTEMP + #ifdef __cplusplus + extern "C" { + #endif + int rpl_mkstemp (char *templ); + #ifdef __cplusplus + } + #endif + #endif + ]) +]) + +# Prerequisites of lib/tempname.c. +AC_DEFUN([jm_PREREQ_TEMPNAME], +[ + AC_REQUIRE([AC_HEADER_STAT]) + AC_CHECK_HEADERS_ONCE(fcntl.h sys/time.h unistd.h) + AC_CHECK_HEADERS(stdint.h) + AC_CHECK_FUNCS(__secure_getenv gettimeofday lstat) + AC_CHECK_DECLS_ONCE(getenv) + # AC_REQUIRE([jm_AC_TYPE_UINTMAX_T]) + + dnl Under Win32, mkdir prototype in io.h has only one arg + AC_MSG_CHECKING(whether mkdir accepts only one arg) + AC_TRY_COMPILE([#include + #include + #include ], [ + mkdir("") + ], [ AC_MSG_RESULT(yes) + AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ], + [ AC_MSG_RESULT(no) + ]) +]) diff --git a/volk/config/onceonly.m4 b/volk/config/onceonly.m4 new file mode 100644 index 000000000..f6fec37cb --- /dev/null +++ b/volk/config/onceonly.m4 @@ -0,0 +1,63 @@ +# onceonly.m4 serial 3 +dnl Copyright (C) 2002, 2003 Free Software Foundation, Inc. +dnl This file is free software, distributed under the terms of the GNU +dnl General Public License. As a special exception to the GNU General +dnl Public License, this file may be distributed as part of a program +dnl that contains a configuration script generated by Autoconf, under +dnl the same distribution terms as the rest of that program. + +dnl This file defines some "once only" variants of standard autoconf macros. +dnl AC_CHECK_HEADERS_ONCE like AC_CHECK_HEADERS +dnl AC_CHECK_FUNCS_ONCE like AC_CHECK_FUNCS +dnl AC_CHECK_DECLS_ONCE like AC_CHECK_DECLS +dnl AC_REQUIRE([AC_HEADER_STDC]) like AC_HEADER_STDC +dnl The advantage is that the check for each of the headers/functions/decls +dnl will be put only once into the 'configure' file. It keeps the size of +dnl the 'configure' file down, and avoids redundant output when 'configure' +dnl is run. +dnl The drawback is that the checks cannot be conditionalized. If you write +dnl if some_condition; then gl_CHECK_HEADERS(stdlib.h); fi +dnl inside an AC_DEFUNed function, the gl_CHECK_HEADERS macro call expands to +dnl empty, and the check will be inserted before the body of the AC_DEFUNed +dnl function. + +dnl Autoconf version 2.57 or newer is recommended. +AC_PREREQ(2.54) + +# AC_CHECK_HEADERS_ONCE(HEADER1 HEADER2 ...) is a once-only variant of +# AC_CHECK_HEADERS(HEADER1 HEADER2 ...). +AC_DEFUN([AC_CHECK_HEADERS_ONCE], [ + : + AC_FOREACH([gl_HEADER_NAME], [$1], [ + AC_DEFUN([gl_CHECK_HEADER_]m4_quote(translit(defn([gl_HEADER_NAME]), + [-./], [___])), [ + AC_CHECK_HEADERS(gl_HEADER_NAME) + ]) + AC_REQUIRE([gl_CHECK_HEADER_]m4_quote(translit(gl_HEADER_NAME, + [-./], [___]))) + ]) +]) + +# AC_CHECK_FUNCS_ONCE(FUNC1 FUNC2 ...) is a once-only variant of +# AC_CHECK_FUNCS(FUNC1 FUNC2 ...). +AC_DEFUN([AC_CHECK_FUNCS_ONCE], [ + : + AC_FOREACH([gl_FUNC_NAME], [$1], [ + AC_DEFUN([gl_CHECK_FUNC_]defn([gl_FUNC_NAME]), [ + AC_CHECK_FUNCS(defn([gl_FUNC_NAME])) + ]) + AC_REQUIRE([gl_CHECK_FUNC_]defn([gl_FUNC_NAME])) + ]) +]) + +# AC_CHECK_DECLS_ONCE(DECL1 DECL2 ...) is a once-only variant of +# AC_CHECK_DECLS(DECL1, DECL2, ...). +AC_DEFUN([AC_CHECK_DECLS_ONCE], [ + : + AC_FOREACH([gl_DECL_NAME], [$1], [ + AC_DEFUN([gl_CHECK_DECL_]defn([gl_DECL_NAME]), [ + AC_CHECK_DECLS(defn([gl_DECL_NAME])) + ]) + AC_REQUIRE([gl_CHECK_DECL_]defn([gl_DECL_NAME])) + ]) +]) diff --git a/volk/config/pkg.m4 b/volk/config/pkg.m4 new file mode 100644 index 000000000..2d4d96109 --- /dev/null +++ b/volk/config/pkg.m4 @@ -0,0 +1,201 @@ +# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- +# +# Copyright © 2004 Scott James Remnant . +# Copyright © 2008 Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# PKG_PROG_PKG_CONFIG([MIN-VERSION]) +# ---------------------------------- +AC_DEFUN([PKG_PROG_PKG_CONFIG], +[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) +m4_pattern_allow([^PKG_CONFIG(_PATH)?$]) +AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl +if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then + AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) +fi +if test -n "$PKG_CONFIG"; then + _pkg_min_version=m4_default([$1], [0.18]) + AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) + if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + PKG_CONFIG="" + fi + +fi[]dnl +])# PKG_PROG_PKG_CONFIG + +# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +# +# Check to see whether a particular set of modules exists. Similar +# to PKG_CHECK_MODULES(), but does not set variables or print errors. +# +# +# Similar to PKG_CHECK_MODULES, make sure that the first instance of +# this or PKG_CHECK_MODULES is called, or make sure to call +# PKG_CHECK_EXISTS manually +# -------------------------------------------------------------- +AC_DEFUN([PKG_CHECK_EXISTS], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +if test -n "$PKG_CONFIG" && \ + AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then + m4_ifval([$2], [$2], [:]) +m4_ifvaln([$3], [else + $3])dnl +fi]) + + +# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) +# --------------------------------------------- +m4_define([_PKG_CONFIG], +[if test -n "$PKG_CONFIG"; then + if test -n "$$1"; then + pkg_cv_[]$1="$$1" + else + PKG_CHECK_EXISTS([$3], + [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`], + [pkg_failed=yes]) + fi +else + pkg_failed=untried +fi[]dnl +])# _PKG_CONFIG + +# _PKG_SHORT_ERRORS_SUPPORTED +# ----------------------------- +AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi[]dnl +])# _PKG_SHORT_ERRORS_SUPPORTED + + +# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +# [ACTION-IF-NOT-FOUND]) +# +# E.g., +# PKG_CHECK_MODULES(GSTUFF, gtk+-2.0 >= 1.3 glib = 1.3.4, action-if, action-not) +# defines: +# +# GSTUFF_LIBS +# GSTUFF_CFLAGS +# GSTUFF_INCLUDEDIR +# GSTUFF_CPPFLAGS # the -I, -D and -U's out of CFLAGS +# +# see pkg-config man page also defines GSTUFF_PKG_ERRORS on error +# +# Note that if there is a possibility the first call to +# PKG_CHECK_MODULES might not happen, you should be sure to include an +# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +# +# -------------------------------------------------------------- +AC_DEFUN([PKG_CHECK_MODULES],[ +AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_REQUIRE([AC_CANONICAL_HOST])dnl +AC_REQUIRE([AC_CANONICAL_BUILD])dnl + +AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl +AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl +AC_ARG_VAR([$1][_INCLUDEDIR], [includedir for $1, overriding pkg-config])dnl + +pkg_failed=no +AC_MSG_CHECKING([for $1]) + +_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) + +if test x$cross_compiling = xyes +then + dnl _PKG_CONFIG([$1][_LIBS], [libs-only-l --static], [$2]) + _PKG_CONFIG([$1][_LIBS], [libs --static], [$2]) + dnl prune out any -L/lib or -L/usr/lib since they're pointing to the wrong filesystem root + _pkg_tmp= + for flag in [$]pkg_cv_[$1][_LIBS]; do + case $flag in + (-L/lib* | -L/usr/lib* ) ;; # ignore + (*) _pkg_tmp="$_pkg_tmp $flag" ;; + esac + done + pkg_cv_[$1][_LIBS]="$_pkg_tmp" +else + _PKG_CONFIG([$1][_LIBS], [libs --static], [$2]) +fi + +_PKG_CONFIG([$1][_INCLUDEDIR], [variable=includedir], [$2]) + + +m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS +and $1[]_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details.]) + +if test $pkg_failed = yes; then + _PKG_SHORT_ERRORS_SUPPORTED + if test $_pkg_short_errors_supported = yes; then + $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"` + else + $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"` + fi + # Put the nasty error message in config.log where it belongs + echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD + + ifelse([$4], , [AC_MSG_ERROR(dnl +[Package requirements ($2) were not met: + +$$1_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. + +_PKG_TEXT +])], + [AC_MSG_RESULT([no]) + $4]) +elif test $pkg_failed = untried; then + ifelse([$4], , [AC_MSG_FAILURE(dnl +[The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. + +_PKG_TEXT + +To get pkg-config, see .])], + [$4]) +else + $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS + $1[]_LIBS=$pkg_cv_[]$1[]_LIBS + $1[]_INCLUDEDIR=$pkg_cv_[]$1[]_INCLUDEDIR + + $1[]_CPPFLAGS="" + for flag in $$1[]_CFLAGS; do + case $flag in + -I* | -D* | -U*) $1[]_CPPFLAGS="$$1[]_CPPFLAGS $flag" ;; + esac + done + pkg_cv_[]$1[]_CPPFLAGS=$$1[]_CPPFLAGS + AC_SUBST($1[]_CPPFLAGS) + + AC_MSG_RESULT([yes]) + ifelse([$3], , :, [$3]) +fi[]dnl +])# PKG_CHECK_MODULES diff --git a/volk/configure.ac b/volk/configure.ac new file mode 100644 index 000000000..eb9fbdc55 --- /dev/null +++ b/volk/configure.ac @@ -0,0 +1,62 @@ +AC_INIT(libvector,0.0svn) +AC_PREREQ(2.57) +AC_CONFIG_AUX_DIR([.]) + + +dnl This is kind of non-standard, but it sure shortens up this file :-) + +m4_include([config/lv_configure.m4]) + +LV_CONFIGURE + +dnl Check for any libraries you need +dnl AC_CHECK_LIBRARY + +dnl Check for header files you need +dnl AC_CHECK_HEADERS(fcntl.h limits.h strings.h sys/ioctl.h sys/time.h unistd.h) +dnl AC_CHECK_HEADERS(sys/mman.h) + +dnl Checks for library functions. +dnl AC_CHECK_FUNCS([]) + +dnl We pick up the boost cppflags, cxxflags and thread lib via GNURADIO_CORE +dnl +dnl If you need additional boost libraries, you'll need to +dnl uncomment AX_BOOST_BASE, plus some of the following: +dnl +dnl calls AC_SUBST(BOOST_CPPFLAGS), AC_SUBST(BOOST_LDFLAGS) and defines HAVE_BOOST +dnl AX_BOOST_BASE([1.35]) +dnl +dnl All the rest of these call AC_SUBST(BOOST__LIB) and define HAVE_BOOST_ +dnl +dnl AX_BOOST_DATE_TIME +dnl AX_BOOST_FILESYSTEM +dnl AX_BOOST_IOSTREAMS +dnl AX_BOOST_PROGRAM_OPTIONS +dnl AX_BOOST_REGEX +dnl AX_BOOST_SERIALIZATION +dnl AX_BOOST_SIGNALS +dnl AX_BOOST_SYSTEM +dnl AX_BOOST_TEST_EXEC_MONITOR +dnl AX_BOOST_UNIT_TEST_FRAMEWORK +dnl AX_BOOST_WSERIALIZATION + +AC_CONFIG_HEADERS([volk_config.h]) +LV_SET_SIMD_FLAGS + +AC_CONFIG_FILES([\ + Makefile \ + config/Makefile \ + include/Makefile \ + include/volk/Makefile \ + lib/Makefile \ + volk.pc \ + ]) + + + + + +AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x system_cleanup.sh && ./system_cleanup.sh $MYCPU $MYSUBCPU], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) + +AC_OUTPUT diff --git a/volk/data/Makefile.am b/volk/data/Makefile.am new file mode 100644 index 000000000..d0ef08bc9 --- /dev/null +++ b/volk/data/Makefile.am @@ -0,0 +1,30 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +# List your data files here. Please keep it under ~ 100KB. +# +# EXTRA_DIST = \ +# foo.dat \ +# bar.dat \ +# baz.dat + +EXTRA_DIST = diff --git a/volk/doc/Makefile.am b/volk/doc/Makefile.am new file mode 100644 index 000000000..cddd72435 --- /dev/null +++ b/volk/doc/Makefile.am @@ -0,0 +1,32 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +# List your doc files here +# +# EXTRA_DIST = \ +# on-the-meaning-of-life \ +# whatever-doesnt-make-you-stronger-kills-you +# + +EXTRA_DIST = + +# and/or run doxygen, xmlto, etc diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am new file mode 100644 index 000000000..375d1a7d5 --- /dev/null +++ b/volk/include/Makefile.am @@ -0,0 +1,23 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +SUBDIRS = volk diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am new file mode 100644 index 000000000..fd59ab795 --- /dev/null +++ b/volk/include/volk/Makefile.am @@ -0,0 +1,133 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +ourinclude_HEADERS = \ + volk_complex.h \ + volk_common.h \ + volk_config_fixed.h \ + volk_runtime.h \ + volk_config.h \ + volk_tables.h \ + volk_typedefs.h \ + volk.h \ + volk_cpu.h \ + volk_environment_init.h \ + volk_16s_add_quad_aligned16.h \ + volk_16s_branch_4_state_8_aligned16.h \ + volk_16sc_deinterleave_16s_aligned16.h \ + volk_16sc_deinterleave_32f_aligned16.h \ + volk_16sc_deinterleave_real_16s_aligned16.h \ + volk_16sc_deinterleave_real_32f_aligned16.h \ + volk_16sc_deinterleave_real_8s_aligned16.h \ + volk_16sc_magnitude_16s_aligned16.h \ + volk_16sc_magnitude_32f_aligned16.h \ + volk_16s_convert_32f_aligned16.h \ + volk_16s_convert_32f_unaligned16.h \ + volk_16s_convert_8s_aligned16.h \ + volk_16s_convert_8s_unaligned16.h \ + volk_16s_max_star_aligned16.h \ + volk_16s_max_star_horizontal_aligned16.h \ + volk_16s_permute_and_scalar_add_aligned16.h \ + volk_16s_quad_max_star_aligned16.h \ + volk_16u_byteswap_aligned16.h \ + volk_32f_accumulator_aligned16.h \ + volk_32f_add_aligned16.h \ + volk_32fc_32f_multiply_aligned16.h \ + volk_32fc_32f_power_32fc_aligned16.h \ + volk_32f_calc_spectral_noise_floor_aligned16.h \ + volk_32fc_atan2_32f_aligned16.h \ + volk_32fc_conjugate_dot_prod_aligned16.h \ + volk_32fc_deinterleave_32f_aligned16.h \ + volk_32fc_deinterleave_64f_aligned16.h \ + volk_32fc_deinterleave_real_16s_aligned16.h \ + volk_32fc_deinterleave_real_32f_aligned16.h \ + volk_32fc_deinterleave_real_64f_aligned16.h \ + volk_32fc_dot_prod_aligned16.h \ + volk_32fc_index_max_aligned16.h \ + volk_32fc_magnitude_16s_aligned16.h \ + volk_32fc_magnitude_32f_aligned16.h \ + volk_32fc_multiply_aligned16.h \ + volk_32f_convert_16s_aligned16.h \ + volk_32f_convert_16s_unaligned16.h \ + volk_32f_convert_32s_aligned16.h \ + volk_32f_convert_32s_unaligned16.h \ + volk_32f_convert_64f_aligned16.h \ + volk_32f_convert_64f_unaligned16.h \ + volk_32f_convert_8s_aligned16.h \ + volk_32f_convert_8s_unaligned16.h \ + volk_32fc_power_spectral_density_32f_aligned16.h \ + volk_32fc_power_spectrum_32f_aligned16.h \ + volk_32fc_square_dist_aligned16.h \ + volk_32fc_square_dist_scalar_mult_aligned16.h \ + volk_32f_divide_aligned16.h \ + volk_32f_dot_prod_aligned16.h \ + volk_32f_dot_prod_unaligned16.h \ + volk_32f_fm_detect_aligned16.h \ + volk_32f_index_max_aligned16.h \ + volk_32f_interleave_16sc_aligned16.h \ + volk_32f_interleave_32fc_aligned16.h \ + volk_32f_max_aligned16.h \ + volk_32f_min_aligned16.h \ + volk_32f_multiply_aligned16.h \ + volk_32f_normalize_aligned16.h \ + volk_32f_power_aligned16.h \ + volk_32f_sqrt_aligned16.h \ + volk_32f_stddev_aligned16.h \ + volk_32f_stddev_and_mean_aligned16.h \ + volk_32f_subtract_aligned16.h \ + volk_32f_sum_of_poly_aligned16.h \ + volk_32s_and_aligned16.h \ + volk_32s_convert_32f_aligned16.h \ + volk_32s_convert_32f_unaligned16.h \ + volk_32s_or_aligned16.h \ + volk_32u_byteswap_aligned16.h \ + volk_32u_popcnt_aligned16.h \ + volk_64f_convert_32f_aligned16.h \ + volk_64f_convert_32f_unaligned16.h \ + volk_64f_max_aligned16.h \ + volk_64f_min_aligned16.h \ + volk_64u_byteswap_aligned16.h \ + volk_64u_popcnt_aligned16.h \ + volk_8sc_deinterleave_16s_aligned16.h \ + volk_8sc_deinterleave_32f_aligned16.h \ + volk_8sc_deinterleave_real_16s_aligned16.h \ + volk_8sc_deinterleave_real_32f_aligned16.h \ + volk_8sc_deinterleave_real_8s_aligned16.h \ + volk_8sc_multiply_conjugate_16sc_aligned16.h \ + volk_8sc_multiply_conjugate_32fc_aligned16.h \ + volk_8s_convert_16s_aligned16.h \ + volk_8s_convert_16s_unaligned16.h \ + volk_8s_convert_32f_aligned16.h \ + volk_8s_convert_32f_unaligned16.h + +distclean-local: + rm -f volk_config_fixed.h + rm -f volk_config.h + rm -f volk_cpu.h + rm -f volk.h + rm -f volk_registry.h + rm -f volk_runtime.h + rm -f volk_typedefs.h + rm -f volk_tables.h + rm -f *.pyc + rm -f Makefile.in + rm -f volk_environment_init.h \ No newline at end of file diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml new file mode 100644 index 000000000..c9c441008 --- /dev/null +++ b/volk/include/volk/archs.xml @@ -0,0 +1,133 @@ + + + + + none + + + + maltivec + + + + 0x80000001 + m32 + d + 29 + 0 + + + + 0x80000001 + d + 29 + m64 + 1 + + + + 0x80000001 + d + 31 + m3dnow + 1 + + + + 1 + 0x80000001 + d + 5 + sse4.2 + + + + 1 + 1 + c + 23 + mpopcnt + + + + 1 + 1 + d + 23 + mmmx + + + + + 1 + 1 + d + 25 + msse + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + xmmintrin.h + + + + + 1 + 1 + d + 26 + msse2 + + + + 1 + 1 + c + 0 + msse3 + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + pmmintrin.h + + + + 1 + 1 + c + 9 + mssse3 + + + + 1 + 0x80000001 + c + 6 + msse4a + + + + + 1 + 1 + c + 19 + msse4.1 + + + + 1 + 1 + c + 20 + msse4.2 + + + + + 1 + 1 + c + 28 + mavx + + + + diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py new file mode 100644 index 000000000..309d7e578 --- /dev/null +++ b/volk/include/volk/emit_omnilog.py @@ -0,0 +1,13 @@ +def emit_prolog(): + tempstring = ""; + tempstring = tempstring + '#ifdef __cplusplus\n'; + tempstring = tempstring + 'extern "C" {\n'; + tempstring = tempstring + '#endif\n'; + return tempstring; +def emit_epilog(): + tempstring = ""; + tempstring = tempstring + '#ifdef __cplusplus\n'; + tempstring = tempstring + '}\n'; + tempstring = tempstring + '#endif\n'; + return tempstring; + diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py new file mode 100644 index 000000000..08d6d949c --- /dev/null +++ b/volk/include/volk/make_c.py @@ -0,0 +1,76 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + +def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + for func in funclist: + tempstring = tempstring + "#include\n" ; + tempstring = tempstring + '\n'; + + tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n"; + tempstring = tempstring + " int i = 1;\n" + tempstring = tempstring + " for(;i\n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + for domarch in dom: + if str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py new file mode 100644 index 000000000..823e3b2c0 --- /dev/null +++ b/volk/include/volk/make_cpuid_h.py @@ -0,0 +1,25 @@ +from xml.dom import minidom +from emit_omnilog import * + +def make_cpuid_h(dom) : + tempstring = ""; + tempstring = tempstring +'/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H'; + tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n'; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + '\n' + + tempstring = tempstring + "struct VOLK_CPU {\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " int (*has_" + arch + ") ();\n"; + tempstring = tempstring + "}volk_cpu;\n\n"; + + tempstring = tempstring + "void volk_cpu_init ();\n" + tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" + + tempstring = tempstring + "\n"; + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n" + + return tempstring; diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py new file mode 100644 index 000000000..443a58488 --- /dev/null +++ b/volk/include/volk/make_cpuid_powerpc_c.py @@ -0,0 +1,45 @@ +from xml.dom import minidom + +def make_cpuid_powerpc_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + #just assume it has them for powerpc + for domarch in dom: + if str(domarch.attributes["type"].value) == "powerpc": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + + tempstring = tempstring + "}\n\n" + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py new file mode 100644 index 000000000..fd83e5e5c --- /dev/null +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -0,0 +1,93 @@ +from xml.dom import minidom + +def make_cpuid_x86_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" + tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[0];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[1];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[2];\n" + tempstring = tempstring + "}\n\n"; + + tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n"; + tempstring = tempstring + " unsigned int regs[4];\n" + tempstring = tempstring + " cpuid_x86 (op, regs);\n" + tempstring = tempstring + " return regs[3];\n" + tempstring = tempstring + "}\n\n"; + + for domarch in dom: + if str(domarch.attributes["type"].value) == "x86": + arch = str(domarch.attributes["name"].value); + op = domarch.getElementsByTagName("op"); + op = str(op[0].firstChild.data); + reg = domarch.getElementsByTagName("reg"); + reg = str(reg[0].firstChild.data); + shift = domarch.getElementsByTagName("shift"); + shift = str(shift[0].firstChild.data); + val = domarch.getElementsByTagName("val"); + val = str(val[0].firstChild.data); + + if op == "1": + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" + tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" + tempstring = tempstring + "}\n\n"; + + elif op == "0x80000001": + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " unsigned int extended_fct_count = cpuid_eax(0x80000000);\n"; + tempstring = tempstring + " if (extended_fct_count < 0x80000001)\n"; + tempstring = tempstring + " return "+ val + "^1;\n\n" + tempstring = tempstring + " unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n"; + tempstring = tempstring + " return ((extended_features >> " + shift + ") & 1) == " + val + ";\n" + tempstring = tempstring + "}\n\n"; + elif str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + else: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 0;\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "void volk_cpu_init() {\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n" + tempstring = tempstring + "}\n\n" + + tempstring = tempstring + "unsigned int volk_get_lvarch() {\n"; + tempstring = tempstring + " unsigned int retval = 0;\n" + tempstring = tempstring + " volk_cpu_init();\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n" + tempstring = tempstring + " return retval;\n" + tempstring = tempstring + "}\n\n" + + return tempstring; + + + + + + + diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py new file mode 100644 index 000000000..e06c7f246 --- /dev/null +++ b/volk/include/volk/make_environment_init_c.py @@ -0,0 +1,33 @@ +from xml.dom import minidom + +def make_environment_init_c(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include\n" + tempstring = tempstring + "#include\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + incs = domarch.getElementsByTagName("include"); + for inc in incs: + my_inc = str(inc.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#include<" + my_inc + ">\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + '\n\n'; + tempstring = tempstring + "void volk_environment_init(){\n" + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + envs = domarch.getElementsByTagName("environment"); + for env in envs: + cmd = str(env.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + " " + cmd + "\n"; + tempstring = tempstring + "#endif\n" + + tempstring = tempstring + "}\n"; + return tempstring; + + + + diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py new file mode 100644 index 000000000..77a841a24 --- /dev/null +++ b/volk/include/volk/make_environment_init_h.py @@ -0,0 +1,18 @@ +from xml.dom import minidom +from emit_omnilog import * + +def make_environment_init_h() : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; + tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n"; + tempstring = tempstring + "\n"; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + "void volk_environment_init();\n"; + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif\n" + return tempstring; + + + + diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py new file mode 100644 index 000000000..81d9ad401 --- /dev/null +++ b/volk/include/volk/make_h.py @@ -0,0 +1,28 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_h(funclist, arched_arglist, retlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_H'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n' + tempstring = tempstring + emit_prolog() + tempstring = tempstring + '\n'; + + for i in range(len(retlist)): + tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; + tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n'; + + tempstring = tempstring + emit_epilog(); + + tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n"; + + return tempstring; diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py new file mode 100644 index 000000000..330e19592 --- /dev/null +++ b/volk/include/volk/make_init_c.py @@ -0,0 +1,42 @@ +from xml.dom import minidom + +def make_init_c(funclist, dom) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + incs = domarch.getElementsByTagName("include"); + for inc in incs: + my_inc = str(inc.firstChild.data); + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + "#include<" + my_inc + ">\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + '\n\n'; + + tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n"; + tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n"; + tempstring = tempstring + " return &volk_runtime;\n"; + tempstring = tempstring + "}\n\n" + tempstring = tempstring + " void volk_runtime_init() {\nvolk_cpu_init();\n"; + + for func in funclist: + tempstring = tempstring + " volk_runtime." + func + " = default_acquire_" + func + ";\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + envs = domarch.getElementsByTagName("environment"); + for env in envs: + cmd = str(env.firstChild.data); + tempstring = tempstring + " if(volk_cpu.has_" + arch + "()){\n"; + tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring + " " + cmd + "\n"; + tempstring = tempstring + "#endif\n" + tempstring = tempstring + " }\n"; + + tempstring = tempstring + "}\n"; + + return tempstring diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py new file mode 100644 index 000000000..6dbe1c585 --- /dev/null +++ b/volk/include/volk/make_init_h.py @@ -0,0 +1,26 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_init_h(funclist, arched_arglist, retlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + + tempstring = tempstring + '\n'; + + tempstring = tempstring + emit_prolog(); + + for i in range(len(retlist)): + tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n'; + + tempstring= tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n"; + + return tempstring; diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py new file mode 100644 index 000000000..051ac268d --- /dev/null +++ b/volk/include/volk/make_mktables.py @@ -0,0 +1,33 @@ + + +def make_mktables(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstrgin = tempstring + '#include\n'; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + 'int main() {\n'; + tempstring = tempstring + ' int i = 0;\n'; + tempstring = tempstring + ' FILE* output;\n'; + tempstring = tempstring + ' output = fopen("volk_tables.h", "w");\n'; + tempstring = tempstring + ' fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n'; + tempstring = tempstring + ' fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n'; + + for func in funclist: + tempstring = tempstring + ' fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n'; + tempstring = tempstring + ' fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n'; + tempstring = tempstring + ' fclose(output);\n' + tempstring = tempstring + '}\n'; + return tempstring; + + + + + + + + diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py new file mode 100644 index 000000000..c75a4d5fb --- /dev/null +++ b/volk/include/volk/make_proccpu_sim.py @@ -0,0 +1,25 @@ +from xml.dom import minidom + +def make_proccpu_sim(dom) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring + "#include \n" + tempstring = tempstring + "#include \n" + tempstring = tempstring + "\n\n" + + tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; + tempstring = tempstring + " if(val==1){\n"; + tempstring = tempstring + " sprintf(buf, \"%s %s\", buf, newkey);\n"; + tempstring = tempstring + " }\n"; + tempstring = tempstring + "}\n"; + tempstring = tempstring + "\n\n"; + + tempstring = tempstring + "int main() {\n"; + tempstring = tempstring + " volk_cpu_init();\n"; + tempstring = tempstring + " char buf[2048];\n"; + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n" + tempstring = tempstring + " printf(\"%s\\n\", buf);\n" + tempstring = tempstring + "}\n" + return tempstring; diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py new file mode 100644 index 000000000..8457d61f3 --- /dev/null +++ b/volk/include/volk/make_registry.py @@ -0,0 +1,62 @@ +from xml.dom import minidom +from emit_omnilog import * +import string + +def make_registry(dom, funclist, fcountlist) : + tempstring = ""; + tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; + tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H'; + tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n'; + tempstring = tempstring +'#include\n'; + tempstring = tempstring +'#include\n'; + tempstring = tempstring + emit_prolog(); + tempstring = tempstring + '\n' + + + + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n"; + tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n"; + tempstring = tempstring +"#else\n"; + tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n"; + tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n"; + + counter = 0; + for fcount in fcountlist: + tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n"; + counter = counter + 1; + for arch_list in fcount: + tempstring = tempstring + " (LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind] + "_CNT"; + if ind < len(arch_list) - 1: + tempstring = tempstring + " * LV_"; + tempstring = tempstring + ") + "; + lindex = tempstring.rfind(" + "); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", ""); + tempstring = tempstring + ",\n" + for arch_list in fcount: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + tempstring = tempstring + "\n" + tempstring = tempstring + " (1 << LV_" + for ind in range(len(arch_list)): + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + ") + (1 << LV_" + tempstring = tempstring + "),\n#endif\n" + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n" + + + tempstring = tempstring + emit_epilog(); + tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n"; + + return tempstring; + diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py new file mode 100644 index 000000000..645b3aaee --- /dev/null +++ b/volk/include/volk/make_runtime.py @@ -0,0 +1,34 @@ +from xml.dom import minidom +from emit_omnilog import * +from volk_regexp import * + + + +def make_runtime(funclist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n'; + + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + emit_prolog(); + + tempstring = tempstring + '\n'; + + tempstring = tempstring + "struct VOLK_RUNTIME {\n"; + + for i in range(len(funclist)): + tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n"; + tempstring = tempstring + "};\n\n"; + + tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n" + tempstring = tempstring + "\nvoid volk_runtime_init();\n"; + + tempstring = tempstring + emit_epilog(); + tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n"; + + return tempstring; + diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py new file mode 100644 index 000000000..070df9ba7 --- /dev/null +++ b/volk/include/volk/make_runtime_c.py @@ -0,0 +1,47 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + +def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + + + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + "#include\n"; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + tempstring = tempstring + '#include\n'; + + for func in funclist: + tempstring = tempstring + "#include\n" ; + tempstring = tempstring + '\n'; + + tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n"; + + for i in range(len(funclist)): + tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; + + tags_counter = 0; + for arch_list in fcountlist[i]: + tempstring = tempstring + "#if LV_HAVE_" + for ind in range(len(arch_list)): + + tempstring = tempstring + arch_list[ind]; + if ind < len(arch_list) - 1: + tempstring = tempstring + " && LV_HAVE_"; + + tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n"; + tags_counter = tags_counter + 1; + + lindex = tempstring.rfind(","); + tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", ""); + tempstring = tempstring + "};\n\n"; + + + tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n'; + tempstring = tempstring + "volk_runtime." + funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n'; + + return tempstring; diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py new file mode 100644 index 000000000..e320bc748 --- /dev/null +++ b/volk/include/volk/make_set_simd.py @@ -0,0 +1,202 @@ +from xml.dom import minidom + +def make_set_simd(dom) : + tempstring = ""; + tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n'; + + tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; + tempstring = tempstring + " case \"$MD_CPU\" in\n"; + tempstring = tempstring + " (x86)\n"; + tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; + tempstring = tempstring + " (x86)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " (*)\n" + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " esac\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " (powerpc)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " (*)\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " ;;\n"; + tempstring = tempstring + " esac\n"; + tempstring = tempstring + "])\n" + + for domarch in dom: + if str(domarch.attributes["type"].value) != "all": + arch = str(domarch.attributes["name"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " LF_CHECK_CC_FLAG([-" + flag + "])\n"; + tempstring = tempstring + " LF_CHECK_CXX_FLAG([-" + flag + "])\n"; + tempstring = tempstring + "])\n"; + + tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n"; + tempstring = tempstring + "[\n"; + tempstring = tempstring + " AC_REQUIRE([GR_SET_MD_CPU])\n"; + tempstring = tempstring + " AC_SUBST(LV_CXXFLAGS)\n"; + tempstring = tempstring + " indCC=no\n"; + tempstring = tempstring + " indCXX=no\n"; + tempstring = tempstring + " indLV_ARCH=no\n"; + tempstring = tempstring + " AC_ARG_WITH(lv_arch,\n"; + tempstring = tempstring + " AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list("; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + arch + ", " + tempstring = tempstring[0:len(tempstring) - 2]; + + tempstring = tempstring + ")]),\n"; + tempstring = tempstring + " [cf_with_lv_arch=\"$withval\"],\n"; + tempstring = tempstring + " [cf_with_lv_arch=\"\"])\n"; + if str(domarch.attributes["type"].value) == "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; + tempstring = tempstring + " ADDONS=\"\"\n"; + tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; + tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; + + tempstring = tempstring + " fi\n"; + tempstring = tempstring + " echo $cf_with_lv_arch\n"; + for domarch in dom: + if str(domarch.attributes["type"].value) != "all": + arch = str(domarch.attributes["name"].value); + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=no\n"; + + tempstring = tempstring + " case \"$MD_CPU\" in\n"; + tempstring = tempstring + " (x86)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + if atype == "x86": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "x86": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + elif atype == "all": + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " ;;\n" + + tempstring = tempstring + " (powerpc)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + if atype == "powerpc": + tempstring = tempstring + " _TRY_ADD_" + arch.swapcase() + "\n"; + + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "powerpc": + tempstring = tempstring + " for i in $lf_CXXFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCXX=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $lf_CFLAGS\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X-" + flag +"; then\n"; + tempstring = tempstring + " indCC=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indCC=no\n" + tempstring = tempstring + " indCXX=no\n" + tempstring = tempstring + " indLV_ARCH=no\n" + elif atype == "all": + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " esac\n" + tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" + tempstring = tempstring + "])\n" + + return tempstring; + + + diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py new file mode 100644 index 000000000..fe81cb2b0 --- /dev/null +++ b/volk/include/volk/make_typedefs.py @@ -0,0 +1,23 @@ +from xml.dom import minidom +import string +from volk_regexp import * + + + +def make_typedefs(funclist, retlist, my_argtypelist) : + tempstring = ""; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; + tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS'; + tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n'; + tempstring = tempstring + '\n\n#include\n'; + tempstring = tempstring + '#include\n'; + + tempstring = tempstring + '\n'; + + for i in range(len(funclist)): + tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n"; + + tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n"; + + return tempstring; diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h new file mode 100644 index 000000000..63042bef1 --- /dev/null +++ b/volk/include/volk/volk_16s_add_quad_aligned16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H +#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t" + ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..fb9d7cb87 --- /dev/null +++ b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h new file mode 100644 index 000000000..126ce1528 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_32f_aligned16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h new file mode 100644 index 000000000..d6212fba5 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_32f_unaligned16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h new file mode 100644 index 000000000..64c368688 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h new file mode 100644 index 000000000..ca925de86 --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_unaligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H +#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h new file mode 100644 index 000000000..ba4e979ec --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_aligned16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H +#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h new file mode 100644 index 000000000..82d011677 --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H +#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..452d05c4f --- /dev/null +++ b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..1004c4d23 --- /dev/null +++ b/volk/include/volk/volk_16s_quad_max_star_aligned16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16s_quad_max_star_aligned16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t" + + "volk_16s_quad_max_star_aligned16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..32e13df98 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h @@ -0,0 +1,146 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..86f67437d --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..b594c85b8 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..3e7be1e64 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..c0d1e941a --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..1482ab82e --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H +#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32786.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..29c58ceb8 --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -0,0 +1,163 @@ +#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H +#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h new file mode 100644 index 000000000..698e958e4 --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -0,0 +1,65 @@ +#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h new file mode 100644 index 000000000..7e395cf50 --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H +#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float tempBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h new file mode 100644 index 000000000..721c60fd6 --- /dev/null +++ b/volk/include/volk/volk_32f_add_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H +#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h new file mode 100644 index 000000000..ff917525f --- /dev/null +++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H +#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h new file mode 100644 index 000000000..7fbabd9c3 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_16s_aligned16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h new file mode 100644 index 000000000..d2bbdf13a --- /dev/null +++ b/volk/include/volk/volk_32f_convert_16s_unaligned16.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h new file mode 100644 index 000000000..011ef5d0e --- /dev/null +++ b/volk/include/volk/volk_32f_convert_32s_aligned16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h new file mode 100644 index 000000000..a6df826c7 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_32s_unaligned16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h new file mode 100644 index 000000000..91a855813 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_aligned16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h new file mode 100644 index 000000000..698e0d446 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_unaligned16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h new file mode 100644 index 000000000..b9487b622 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_8s_aligned16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h new file mode 100644 index 000000000..e986dbc87 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_8s_unaligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H +#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h new file mode 100644 index 000000000..c00700cd8 --- /dev/null +++ b/volk/include/volk/volk_32f_divide_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H +#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h new file mode 100644 index 000000000..3aee1136a --- /dev/null +++ b/volk/include/volk/volk_32f_dot_prod_aligned16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h new file mode 100644 index 000000000..bce6aa15f --- /dev/null +++ b/volk/include/volk/volk_32f_dot_prod_unaligned16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H +#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..c82239d74 --- /dev/null +++ b/volk/include/volk/volk_32f_fm_detect_aligned16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H +#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. +*/ +static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h new file mode 100644 index 000000000..26322bfa2 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_aligned16.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#if LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_GENERIC +static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h new file mode 100644 index 000000000..476946b88 --- /dev/null +++ b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H +#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h new file mode 100644 index 000000000..859c6a0ef --- /dev/null +++ b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H +#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h new file mode 100644 index 000000000..96aafb2bf --- /dev/null +++ b/volk/include/volk/volk_32f_max_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h new file mode 100644 index 000000000..e247f4213 --- /dev/null +++ b/volk/include/volk/volk_32f_min_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H +#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h new file mode 100644 index 000000000..b557580ab --- /dev/null +++ b/volk/include/volk/volk_32f_multiply_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h new file mode 100644 index 000000000..1aabb1d9d --- /dev/null +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H +#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be vectorized + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); + + inputPtr += 4; + } + + number = quarterPoints*4; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h new file mode 100644 index 000000000..2ecd8eecb --- /dev/null +++ b/volk/include/volk/volk_32f_power_aligned16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H +#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h new file mode 100644 index 000000000..0b2eaf251 --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_aligned16.h @@ -0,0 +1,64 @@ +#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H +#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h new file mode 100644 index 000000000..1c6a08437 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_aligned16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H +#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h new file mode 100644 index 000000000..1cd502257 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H +#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h new file mode 100644 index 000000000..ac3f5e5d1 --- /dev/null +++ b/volk/include/volk/volk_32f_subtract_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H +#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h new file mode 100644 index 000000000..a326e62b1 --- /dev/null +++ b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H +#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h new file mode 100644 index 000000000..436656ca0 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h new file mode 100644 index 000000000..2d71ee4f8 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H +#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h new file mode 100644 index 000000000..df0ebb987 --- /dev/null +++ b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H +#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h new file mode 100644 index 000000000..60103c1b5 --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..02085cd1e --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h new file mode 100644 index 000000000..3d9ebccdd --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..3026b2422 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..2af973bcc --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h new file mode 100644 index 000000000..f408589c4 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h new file mode 100644 index 000000000..1a834dc25 --- /dev/null +++ b/volk/include/volk/volk_32fc_dot_prod_aligned16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H +#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h new file mode 100644 index 000000000..d77a95f90 --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_aligned16.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! + xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..4e64d8c22 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h @@ -0,0 +1,146 @@ +#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..7a8fd1ef9 --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h @@ -0,0 +1,121 @@ +#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h new file mode 100644 index 000000000..6a1649fdb --- /dev/null +++ b/volk/include/volk/volk_32fc_multiply_aligned16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H +#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..52ec0f95b --- /dev/null +++ b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h new file mode 100644 index 000000000..645629b9d --- /dev/null +++ b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H +#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h new file mode 100644 index 000000000..6458ea4dd --- /dev/null +++ b/volk/include/volk/volk_32fc_square_dist_aligned16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H +#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h new file mode 100644 index 000000000..0fcc86f1e --- /dev/null +++ b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H +#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h new file mode 100644 index 000000000..e9f1e3a43 --- /dev/null +++ b/volk/include/volk/volk_32s_and_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H +#define INCLUDED_VOLK_32s_AND_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h new file mode 100644 index 000000000..a407e68bd --- /dev/null +++ b/volk/include/volk/volk_32s_convert_32f_aligned16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h new file mode 100644 index 000000000..ad7d4eb17 --- /dev/null +++ b/volk/include/volk/volk_32s_convert_32f_unaligned16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h new file mode 100644 index 000000000..f4c427c4d --- /dev/null +++ b/volk/include/volk/volk_32s_or_aligned16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H +#define INCLUDED_VOLK_32s_OR_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h new file mode 100644 index 000000000..09173a9d5 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_aligned16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h new file mode 100644 index 000000000..37cfd112c --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_aligned16.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H +#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h new file mode 100644 index 000000000..44df66104 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h new file mode 100644 index 000000000..08cfb6127 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_unaligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h new file mode 100644 index 000000000..ce4907a8c --- /dev/null +++ b/volk/include/volk/volk_64f_max_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H +#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h new file mode 100644 index 000000000..acf4d6b2a --- /dev/null +++ b/volk/include/volk/volk_64f_min_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H +#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h new file mode 100644 index 000000000..d5e1b6f30 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_aligned16.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H +#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h new file mode 100644 index 000000000..e8e1396e7 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_aligned16.h @@ -0,0 +1,74 @@ +#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H +#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { +#if LV_64 + *ret = _mm_popcnt_u64(value); +#else + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +#endif +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h new file mode 100644 index 000000000..0efe3c6a1 --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h new file mode 100644 index 000000000..05b916cea --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_unaligned16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h new file mode 100644 index 000000000..54b66ef8f --- /dev/null +++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h @@ -0,0 +1,92 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h new file mode 100644 index 000000000..8019aac9a --- /dev/null +++ b/volk/include/volk/volk_8s_convert_32f_unaligned16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H +#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..38eaa49ea --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..d0c118965 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..d0cb49494 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..c849448ea --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..d84d64568 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h new file mode 100644 index 000000000..470a67539 --- /dev/null +++ b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H +#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h new file mode 100644 index 000000000..52b444cf7 --- /dev/null +++ b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H +#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h new file mode 100644 index 000000000..6f444ad89 --- /dev/null +++ b/volk/include/volk/volk_common.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_LIBVECTOR_COMMON_H +#define INCLUDED_LIBVECTOR_COMMON_H + +#include +#if LV_HAVE_MMX +#include +union bit128{ + uint16_t i16[8]; + uint32_t i[4]; + float f[4]; + double d[2]; + __m128i int_vec; + __m128 float_vec; + __m128d double_vec; +}; +#endif /*LV_HAVE_MMX*/ + +#endif /*INCLUDED_LIBVECTOR_COMMON_H*/ diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h new file mode 100644 index 000000000..b20b5cf47 --- /dev/null +++ b/volk/include/volk/volk_complex.h @@ -0,0 +1,71 @@ +#ifndef INCLUDE_VOLK_COMPLEX_H +#define INCLUDE_VOLK_COMPLEX_H + +/*! + \brief This header file is to prevent issues with having and variables in the same code as the gcc compiler does not allow that +*/ +#ifdef __cplusplus + +#include +#include + +typedef std::complex lv_8sc_t; +typedef std::complex lv_16sc_t; +typedef std::complex lv_32sc_t; +typedef std::complex lv_32fc_t; +typedef std::complex lv_64fc_t; + +static inline float lv_creal(const lv_32fc_t x){ + return x.real(); +} + +static inline float lv_cimag(const lv_32fc_t x){ + return x.imag(); +} + +static inline lv_32fc_t lv_conj(const lv_32fc_t x){ + return std::conj(x); +} + +static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ + return std::pow(x, y); +} + +static inline lv_32fc_t lv_32fc_init(const float x, const float y){ + return std::complex(x,y); +} + +#else + +#include + +typedef char complex lv_8sc_t; +typedef short complex lv_16sc_t; +typedef int complex lv_32sc_t; +typedef float complex lv_32fc_t; +typedef double complex lv_64fc_t; + +static inline float lv_creal(const lv_32fc_t x){ + return creal(x); +} + +static inline float lv_cimag(const lv_32fc_t x){ + return cimag(x); +} + +static inline lv_32fc_t lv_conj(const lv_32fc_t x){ + return conj(x); +} + +static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){ + return cpow(x, y); +} + +static inline lv_32fc_t lv_32fc_init(const float x, const float y){ + return x + I*y; +} + +#endif + + +#endif /* INCLUDE_VOLK_COMPLEX_H */ diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py new file mode 100644 index 000000000..7b695cb3b --- /dev/null +++ b/volk/include/volk/volk_regexp.py @@ -0,0 +1,8 @@ +import re + +remove_after_underscore = re.compile("_.*"); +space_remove = re.compile(" "); +leading_space_remove = re.compile("^ *"); +replace_arch = re.compile(", const char\* arch"); +replace_bracket = re.compile(" {"); +replace_volk = re.compile("volk"); diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py new file mode 100755 index 000000000..d00501a93 --- /dev/null +++ b/volk/include/volk/volk_register.py @@ -0,0 +1,279 @@ +#! /usr/bin/python + +import sys +import re +import string +from xml.dom import minidom +from volk_regexp import * +from make_cpuid_x86_c import make_cpuid_x86_c +from make_cpuid_h import make_cpuid_h +from make_proccpu_sim import make_proccpu_sim +from make_set_simd import make_set_simd +from make_cpuid_generic_c import make_cpuid_generic_c +from make_cpuid_powerpc_c import make_cpuid_powerpc_c +from make_registry import make_registry +from make_h import make_h +from make_init_h import make_init_h +from make_config_fixed import make_config_fixed +from make_config_in import make_config_in +from make_c import make_c +from make_runtime_c import make_runtime_c +from make_init_c import make_init_c +from make_runtime import make_runtime +from make_typedefs import make_typedefs +from make_mktables import make_mktables +from make_environment_init_c import make_environment_init_c +from make_environment_init_h import make_environment_init_h + +outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w"); +outfile_reg = open("volk_registry.h", "w"); +outfile_h = open("volk.h", "w"); +outfile_c = open("../../lib/volk.c", "w"); +outfile_runtime = open("volk_runtime.h", "w"); +outfile_runtime_c = open("../../lib/volk_runtime.c", "w"); +outfile_typedefs = open("volk_typedefs.h", "w"); +outfile_init_h = open("../../lib/volk_init.h", "w"); +outfile_init_c = open("../../lib/volk_init.c", "w"); +outfile_cpu_h = open("volk_cpu.h", "w"); +outfile_cpu_x86_c = open("../../lib/volk_cpu_x86.c", "w"); +outfile_cpu_generic_c = open("../../lib/volk_cpu_generic.c", "w"); +outfile_cpu_powerpc_c = open("../../lib/volk_cpu_powerpc.c", "w"); +outfile_proccpu_sim = open("../../lib/volk_proccpu_sim.c", "w"); +outfile_config_in = open("../../volk_config.h.in", "w"); +outfile_config_fixed = open("volk_config_fixed.h", "w"); +outfile_mktables = open("../../lib/volk_mktables.c", "w"); +outfile_environment_c = open("../../lib/volk_environment_init.c", "w"); +outfile_environment_h = open("volk_environment_init.h", "w"); +infile = open("Makefile.am", "r"); + + +mfile = infile.readlines(); + +datatypes = []; +functions = []; + + + +for line in mfile: + subline = re.search(".*(aligned).*", line); + if subline: + subsubline = re.search("(?<=volk_).*", subline.group(0)); + if subsubline: + dtype = remove_after_underscore.sub("", subsubline.group(0)); + subdtype = re.search("[0-9]+[A-z]+", dtype); + if subdtype: + datatypes.append(subdtype.group(0)); + + +datatypes = set(datatypes); + +for line in mfile: + for dt in datatypes: + if dt in line: + subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line); + if subline: + + subsubline = re.search(".+(?=\.h)", subline.group(0)); + + functions.append(subsubline.group(0)); + +archs = []; +afile = minidom.parse("archs.xml"); +filearchs = afile.getElementsByTagName("arch"); +for filearch in filearchs: + archs.append(str(filearch.attributes["name"].value)); + +for arch in archs: + a_var = re.search("^\$", arch); + if a_var: + archs.remove(arch); + + + +archs_or = "(" +for arch in archs: + archs_or = archs_or + string.upper(arch) + "|"; +archs_or = archs_or[0:len(archs_or)-1]; +archs_or = archs_or + ")"; + + +taglist = []; +fcountlist = []; +arched_arglist = []; +retlist = []; +my_arglist = []; +my_argtypelist = []; +for func in functions: + tags = []; + fcount = []; + infile_source = open(func + ".h"); + begun_name = 0; + begun_paren = 0; + sourcefile = infile_source.readlines(); + infile_source.close(); + for line in sourcefile: + + archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line); + if archline: + arch = archline.group(0); + archline = re.findall(archs_or + "(?=( |\n|&))", line); + if archline: + archsublist = []; + for tup in archline: + archsublist.append(tup[0]); + fcount.append(archsublist); + testline = re.search("static inline.*?" + func, line); + if (not testline): + continue + tagline = re.search(func + "_.+", line); + if tagline: + tag = re.search("(?<=" + func + "_)\w+(?= *\()",line); + if tag: + tag = re.search("\w+", tag.group(0)); + if tag: + tags.append(tag.group(0)); + + + if begun_name == 0: + retline = re.search(".+(?=" + func + ")", line); + if retline: + ret = retline.group(0); + + + + + subline = re.search(func + ".*", line); + if subline: + subsubline = re.search("\(.*?\)", subline.group(0)); + if subsubline: + args = subsubline.group(0); + + else: + begun_name = 1; + subsubline = re.search("\(.*", subline.group(0)); + if subsubline: + args = subsubline.group(0); + begun_paren = 1; + else: + if begun_paren == 1: + subline = re.search(".*?\)", line); + if subline: + args = args + subline.group(0); + begun_name = 0; + begun_paren = 0; + else: + subline = re.search(".*", line); + args = args + subline.group(0); + else: + subline = re.search("\(.*?\)", line); + if subline: + args = subline.group(0); + begun_name = 0; + else: + subline = re.search("\(.*", line); + if subline: + args = subline.group(0); + begun_paren = 1; + + replace = re.compile("static "); + ret = replace.sub("", ret); + replace = re.compile("inline "); + ret = replace.sub("", ret); + replace = re.compile("\)"); + arched_args = replace.sub(", const char* arch) {", args); + + remove = re.compile('\)|\(|{'); + rargs = remove.sub("", args); + sargs = rargs.split(','); + + + + margs = []; + atypes = []; + for arg in sargs: + temp = arg.split(" "); + margs.append(temp[-1]); + replace = re.compile(" " + temp[-1]); + atypes.append(replace.sub("", arg)); + + + my_args = "" + arg_types = "" + for arg in range(0, len(margs) - 1): + this_arg = leading_space_remove.sub("", margs[arg]); + my_args = my_args + this_arg + ", "; + this_type = leading_space_remove.sub("", atypes[arg]); + arg_types = arg_types + this_type + ", "; + + this_arg = leading_space_remove.sub("", margs[-1]); + my_args = my_args + this_arg; + this_type = leading_space_remove.sub("", atypes[-1]); + arg_types = arg_types + this_type; + my_argtypelist.append(arg_types); + + if(ret[-1] != ' '): + ret = ret + ' '; + + arched_arglist.append(arched_args); #!!!!!!!!!!! + my_arglist.append(my_args) #!!!!!!!!!!!!!!!!! + retlist.append(ret); + fcountlist.append(fcount); + taglist.append(tags); + +outfile_mktables.write(make_mktables(functions)); +outfile_mktables.close(); + + +outfile_cpu_h.write(make_cpuid_h(filearchs)); +outfile_cpu_h.close(); + +outfile_cpu_x86_c.write(make_cpuid_x86_c(filearchs)); +outfile_cpu_x86_c.close(); + +outfile_proccpu_sim.write(make_proccpu_sim(filearchs)); +outfile_proccpu_sim.close(); + +outfile_set_simd.write(make_set_simd(filearchs)); +outfile_set_simd.close(); + +outfile_cpu_generic_c.write(make_cpuid_generic_c(filearchs)); +outfile_cpu_generic_c.close(); + +outfile_cpu_powerpc_c.write(make_cpuid_powerpc_c(filearchs)); +outfile_cpu_powerpc_c.close(); + +outfile_config_in.write(make_config_in(filearchs)); +outfile_config_in.close(); + +outfile_reg.write(make_registry(filearchs, functions, fcountlist)); +outfile_reg.close(); + +outfile_h.write(make_h(functions, arched_arglist, retlist)); +outfile_h.close(); + +outfile_init_h.write(make_init_h(functions, arched_arglist, retlist)); +outfile_init_h.close(); + +outfile_config_fixed.write(make_config_fixed(filearchs)); +outfile_config_fixed.close(); + +outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_c.close(); + +outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist)); +outfile_runtime_c.close(); + +outfile_init_c.write(make_init_c(functions, filearchs)); +outfile_init_c.close(); + +outfile_runtime.write(make_runtime(functions)); +outfile_runtime.close(); + +outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist)); +outfile_typedefs.close(); + +outfile_environment_c.write(make_environment_init_c(filearchs)); +outfile_environment_c.close(); + +outfile_environment_h.write(make_environment_init_h()); +outfile_environment_h.close(); diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am new file mode 100644 index 000000000..97eb75680 --- /dev/null +++ b/volk/lib/Makefile.am @@ -0,0 +1,361 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS) + + +# We build 2 libraries and 1 executable here. One library contains +# everything except the libcppunit QA code, and one contains only the +# libcppunit-based QA code. The C++ QA code is especially recommended +# when you have general purpose C or C++ code that may not get +# thoroughly exercised by building and running a GR block. The +# executable runs the QA code at "make check" time. +# +# N.B., If there's a SWIG generated shared library and associated +# python code, it will be contained in ../python, not here. (That +# code is conditionally built depending on the state of the +# --without-python configure option.) However, the .i should be here +# next to the .h that it's based on. + + +# list of programs run by "make check" and "make distcheck" +TESTS = test_all + + +lib_LTLIBRARIES = \ + libvolk.la \ + libvolk_runtime.la \ + libvolk_qa.la + + +# ---------------------------------------------------------------- +# The main library +# ---------------------------------------------------------------- + +universal_runtime_CODE = \ + volk_runtime.c \ + volk_init.c \ + volk_rank_archs.c + +universal_CODE = \ + volk.c \ + volk_environment_init.c + +generic_CODE = \ + volk_cpu_generic.cc + +x86_CODE = \ + volk_cpu_x86.c + +x86_SUBCODE = \ + cpuid_x86.S + +x86_64_SUBCODE = \ + cpuid_x86_64.S + +powerpc_CODE = \ + volk_cpu_powerpc.cc + + +if MD_CPU_generic +libvolk_la_SOURCES = \ + $(generic_CODE) \ + $(universal_CODE) +libvolk_runtime_la_SOURCES = \ + $(generic_CODE) \ + $(universal_runtime_CODE) + +endif + +if MD_CPU_x86 +if MD_SUBCPU_x86_64 +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_64_SUBCODE) \ + $(universal_runtime_CODE) +else +libvolk_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(x86_CODE) \ + $(x86_SUBCODE) \ + $(universal_runtime_CODE) +endif +endif + + +if MD_CPU_powerpc +libvolk_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_CODE) + +libvolk_runtime_la_SOURCES = \ + $(powerpc_CODE) \ + $(universal_runtime_CODE) +endif + + + +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_la_LIBADD = + + + +# ---------------------------------------------------------------- +# The QA library. Note libvolk.la in LIBADD +# ---------------------------------------------------------------- +libvolk_qa_la_SOURCES = \ + qa_volk.cc \ + qa_16s_quad_max_star_aligned16.cc \ + qa_32fc_dot_prod_aligned16.cc \ + qa_32fc_square_dist_aligned16.cc \ + qa_32fc_square_dist_scalar_mult_aligned16.cc \ + qa_32f_sum_of_poly_aligned16.cc \ + qa_32fc_index_max_aligned16.cc \ + qa_32f_index_max_aligned16.cc \ + qa_32fc_conjugate_dot_prod_aligned16.cc \ + qa_16s_permute_and_scalar_add_aligned16.cc \ + qa_16s_branch_4_state_8_aligned16.cc \ + qa_16s_max_star_horizontal_aligned16.cc \ + qa_16s_max_star_aligned16.cc \ + qa_16s_add_quad_aligned16.cc \ + qa_32f_add_aligned16.cc \ + qa_32f_subtract_aligned16.cc \ + qa_32f_max_aligned16.cc \ + qa_32f_min_aligned16.cc \ + qa_64f_max_aligned16.cc \ + qa_64f_min_aligned16.cc \ + qa_32s_and_aligned16.cc \ + qa_32s_or_aligned16.cc \ + qa_32f_dot_prod_aligned16.cc \ + qa_32f_dot_prod_unaligned16.cc \ + qa_32f_fm_detect_aligned16.cc \ + qa_32fc_32f_multiply_aligned16.cc \ + qa_32fc_multiply_aligned16.cc \ + qa_32f_divide_aligned16.cc \ + qa_32f_multiply_aligned16.cc \ + qa_32f_sqrt_aligned16.cc \ + qa_8sc_multiply_conjugate_16sc_aligned16.cc \ + qa_8sc_multiply_conjugate_32fc_aligned16.cc \ + qa_32u_popcnt_aligned16.cc \ + qa_64u_popcnt_aligned16.cc \ + qa_64u_byteswap_aligned16.cc \ + qa_8sc_deinterleave_32f_aligned16.cc \ + qa_16sc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_16s_aligned16.cc \ + qa_32f_interleave_32fc_aligned16.cc \ + qa_16u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_16s_aligned16.cc \ + qa_32fc_deinterleave_real_32f_aligned16.cc \ + qa_32fc_magnitude_32f_aligned16.cc \ + qa_32fc_deinterleave_real_64f_aligned16.cc \ + qa_32fc_deinterleave_real_16s_aligned16.cc \ + qa_32fc_magnitude_16s_aligned16.cc \ + qa_32fc_deinterleave_32f_aligned16.cc \ + qa_8sc_deinterleave_real_8s_aligned16.cc \ + qa_32fc_deinterleave_64f_aligned16.cc \ + qa_32f_interleave_16sc_aligned16.cc \ + qa_16sc_deinterleave_real_8s_aligned16.cc \ + qa_16sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_32f_aligned16.cc \ + qa_32u_byteswap_aligned16.cc \ + qa_16sc_deinterleave_real_16s_aligned16.cc \ + qa_8sc_deinterleave_real_32f_aligned16.cc \ + qa_16sc_magnitude_16s_aligned16.cc \ + qa_32f_normalize_aligned16.cc \ + qa_8sc_deinterleave_real_16s_aligned16.cc \ + qa_16s_convert_32f_aligned16.cc \ + qa_16s_convert_32f_unaligned16.cc \ + qa_16s_convert_8s_aligned16.cc \ + qa_16s_convert_8s_unaligned16.cc \ + qa_32f_convert_16s_aligned16.cc \ + qa_32f_convert_16s_unaligned16.cc \ + qa_32f_convert_32s_aligned16.cc \ + qa_32f_convert_32s_unaligned16.cc \ + qa_32f_convert_64f_aligned16.cc \ + qa_32f_convert_64f_unaligned16.cc \ + qa_32f_convert_8s_aligned16.cc \ + qa_32f_convert_8s_unaligned16.cc \ + qa_32s_convert_32f_aligned16.cc \ + qa_32s_convert_32f_unaligned16.cc \ + qa_64f_convert_32f_aligned16.cc \ + qa_64f_convert_32f_unaligned16.cc \ + qa_8s_convert_16s_aligned16.cc \ + qa_8s_convert_16s_unaligned16.cc \ + qa_8s_convert_32f_aligned16.cc \ + qa_8s_convert_32f_unaligned16.cc \ + qa_32fc_32f_power_32fc_aligned16.cc \ + qa_32f_power_aligned16.cc \ + qa_32fc_atan2_32f_aligned16.cc \ + qa_32fc_power_spectral_density_32f_aligned16.cc \ + qa_32fc_power_spectrum_32f_aligned16.cc \ + qa_32f_calc_spectral_noise_floor_aligned16.cc \ + qa_32f_accumulator_aligned16.cc \ + qa_32f_stddev_aligned16.cc \ + qa_32f_stddev_and_mean_aligned16.cc + +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 + +libvolk_qa_la_LIBADD = \ + libvolk.la \ + libvolk_runtime.la \ + $(CPPUNIT_LIBS) + +# ---------------------------------------------------------------- +# headers that don't get installed +# ---------------------------------------------------------------- +noinst_HEADERS = \ + volk_init.h \ + qa_volk.h \ + qa_16s_quad_max_star_aligned16.h \ + qa_32fc_dot_prod_aligned16.h \ + qa_32fc_square_dist_aligned16.h \ + qa_32fc_square_dist_scalar_mult_aligned16.h \ + qa_32f_sum_of_poly_aligned16.h \ + qa_32fc_index_max_aligned16.h \ + qa_32f_index_max_aligned16.h \ + qa_32fc_conjugate_dot_prod_aligned16.h \ + qa_16s_permute_and_scalar_add_aligned16.h \ + qa_16s_branch_4_state_8_aligned16.h \ + qa_16s_max_star_horizontal_aligned16.h \ + qa_16s_max_star_aligned16.h \ + qa_16s_add_quad_aligned16.h \ + qa_32f_add_aligned16.h \ + qa_32f_subtract_aligned16.h \ + qa_32f_max_aligned16.h \ + qa_32f_min_aligned16.h \ + qa_64f_max_aligned16.h \ + qa_64f_min_aligned16.h \ + qa_32s_and_aligned16.h \ + qa_32s_or_aligned16.h \ + qa_32f_dot_prod_aligned16.h \ + qa_32f_dot_prod_unaligned16.h \ + qa_32f_fm_detect_aligned16.h \ + qa_32fc_32f_multiply_aligned16.h \ + qa_32fc_multiply_aligned16.h \ + qa_32f_divide_aligned16.h \ + qa_32f_multiply_aligned16.h \ + qa_32f_sqrt_aligned16.h \ + qa_8sc_multiply_conjugate_16sc_aligned16.h \ + qa_8sc_multiply_conjugate_32fc_aligned16.h \ + qa_32u_popcnt_aligned16.h \ + qa_64u_popcnt_aligned16.h \ + qa_64u_byteswap_aligned16.h \ + qa_8sc_deinterleave_32f_aligned16.h \ + qa_16sc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_16s_aligned16.h \ + qa_32f_interleave_32fc_aligned16.h \ + qa_16u_byteswap_aligned16.h \ + qa_16sc_deinterleave_16s_aligned16.h \ + qa_32fc_deinterleave_real_32f_aligned16.h \ + qa_32fc_magnitude_32f_aligned16.h \ + qa_32fc_deinterleave_real_64f_aligned16.h \ + qa_32fc_deinterleave_real_16s_aligned16.h \ + qa_32fc_magnitude_16s_aligned16.h \ + qa_32fc_deinterleave_32f_aligned16.h \ + qa_8sc_deinterleave_real_8s_aligned16.h \ + qa_32fc_deinterleave_64f_aligned16.h \ + qa_32f_interleave_16sc_aligned16.h \ + qa_16sc_deinterleave_real_8s_aligned16.h \ + qa_16sc_deinterleave_real_32f_aligned16.h \ + qa_16sc_magnitude_32f_aligned16.h \ + qa_32u_byteswap_aligned16.h \ + qa_16sc_deinterleave_real_16s_aligned16.h \ + qa_8sc_deinterleave_real_32f_aligned16.h \ + qa_16sc_magnitude_16s_aligned16.h \ + qa_32f_normalize_aligned16.h \ + qa_8sc_deinterleave_real_16s_aligned16.h \ + qa_16s_convert_32f_aligned16.h \ + qa_16s_convert_32f_unaligned16.h \ + qa_16s_convert_8s_aligned16.h \ + qa_16s_convert_8s_unaligned16.h \ + qa_32f_convert_16s_aligned16.h \ + qa_32f_convert_16s_unaligned16.h \ + qa_32f_convert_32s_aligned16.h \ + qa_32f_convert_32s_unaligned16.h \ + qa_32f_convert_64f_aligned16.h \ + qa_32f_convert_64f_unaligned16.h \ + qa_32f_convert_8s_aligned16.h \ + qa_32f_convert_8s_unaligned16.h \ + qa_32s_convert_32f_aligned16.h \ + qa_32s_convert_32f_unaligned16.h \ + qa_64f_convert_32f_aligned16.h \ + qa_64f_convert_32f_unaligned16.h \ + qa_8s_convert_16s_aligned16.h \ + qa_8s_convert_16s_unaligned16.h \ + qa_8s_convert_32f_aligned16.h \ + qa_8s_convert_32f_unaligned16.h \ + qa_32fc_32f_power_32fc_aligned16.h \ + qa_32f_power_aligned16.h \ + qa_32fc_atan2_32f_aligned16.h \ + qa_32fc_power_spectral_density_32f_aligned16.h \ + qa_32fc_power_spectrum_32f_aligned16.h \ + qa_32f_calc_spectral_noise_floor_aligned16.h \ + qa_32f_accumulator_aligned16.h \ + qa_32f_stddev_aligned16.h \ + qa_32f_stddev_and_mean_aligned16.h + + +# ---------------------------------------------------------------- +# Our test program +# ---------------------------------------------------------------- +noinst_PROGRAMS = \ + test_all + +test_all_SOURCES = test_all.cc +test_all_LDADD = libvolk_qa.la + + +distclean-local: + rm -f volk.c + rm -f volk_cpu_generic.c + rm -f volk_cpu_powerpc.c + rm -f volk_cpu_x86.c + rm -f volk_init.c + rm -f volk_init.h + rm -f volk_mktables + rm -f volk_mktables.c + rm -f volk_proccpu_sim.c + rm -f volk_runtime.c + rm -f volk_tables.h + rm -f volk_environment_init.c +#SUBDIRS = + +#ifdef BUILD_SSE +#SUBDIRS += sse +#elif BUILD_SPU +#SUBDIRS += spu +#else +#SUBDIRS += port +#endif + + diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h new file mode 100644 index 000000000..8a99aa07c --- /dev/null +++ b/volk/lib/assembly.h @@ -0,0 +1,67 @@ +/* -*- c++ -*- */ +/* + * Copyright 2002 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef _ASSEMBLY_H_ +#define _ASSEMBLY_H_ + +#if defined (__APPLE__) && defined (__APPLE_CC__) + +// XCode ignores the .scl and .type functions in XCode 2.2.1 and 2.3, +// but creates an error in XCode 2.4. Just ignore them. + +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) /* none */ + +#define FUNC_TAIL(f) /* none*/ + +#elif !defined (__ELF__) + +/* + * Too bad, the following define does not work as expected --SF + * #define GLOB_SYMB(f) __USER_LABEL_PREFIX__ ## f + */ +#define GLOB_SYMB(f) _ ## f + +#define DEF_FUNC_HEAD(f) \ + .def GLOB_SYMB(f); .scl 2; .type 32; .endef + +#define FUNC_TAIL(f) /* none */ + + +#else /* !__ELF__ */ + + +#define GLOB_SYMB(f) f + +#define DEF_FUNC_HEAD(f) \ + .type GLOB_SYMB(f),@function \ + +#define FUNC_TAIL(f) \ + .Lfe1: \ + .size GLOB_SYMB(f),.Lfe1-GLOB_SYMB(f) + + +#endif /* !__ELF__ */ + + +#endif /* _ASSEMBLY_H_ */ diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S new file mode 100644 index 000000000..4e1a9404f --- /dev/null +++ b/volk/lib/cpuid_x86.S @@ -0,0 +1,60 @@ +# +# Copyright 2003 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +# +# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result +# +# void cpuid_x86 (unsigned int op, unsigned int result[4]); +# + +#include "assembly.h" + +.file "cpuid_x86.S" + .version "01.01" +.text +.globl GLOB_SYMB(cpuid_x86) + DEF_FUNC_HEAD(cpuid_x86) +GLOB_SYMB(cpuid_x86): + pushl %ebp + movl %esp, %ebp + pushl %ebx # must save in PIC mode, holds GOT pointer + pushl %esi + + movl 8(%ebp), %eax # op + movl 12(%ebp), %esi # result + cpuid + movl %eax, 0(%esi) + movl %ebx, 4(%esi) + movl %ecx, 8(%esi) + movl %edx, 12(%esi) + + popl %esi + popl %ebx + popl %ebp + ret + +FUNC_TAIL(cpuid_x86) + .ident "Hand coded cpuid assembly" + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S new file mode 100644 index 000000000..32b1847cd --- /dev/null +++ b/volk/lib/cpuid_x86_64.S @@ -0,0 +1,54 @@ +# +# Copyright 2003,2005 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +# +# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result +# +# void cpuid_x86 (unsigned int op, unsigned int result[4]); +# + +#include "assembly.h" + +.file "cpuid_x86_64.S" + .version "01.01" +.text +.globl GLOB_SYMB(cpuid_x86) + DEF_FUNC_HEAD(cpuid_x86) +GLOB_SYMB(cpuid_x86): + mov %rbx, %r11 # must save in PIC mode, holds GOT pointer + + mov %rdi, %rax # op + cpuid + movl %eax, 0(%rsi) # result + movl %ebx, 4(%rsi) + movl %ecx, 8(%rsi) + movl %edx, 12(%rsi) + + mov %r11, %rbx + retq + +FUNC_TAIL(cpuid_x86) + .ident "Hand coded cpuid64 assembly" + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc new file mode 100644 index 000000000..c3005c1be --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_add_quad_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + short input4[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + short output2[vlen] __attribute__ ((aligned (16))); + short output3[vlen] __attribute__ ((aligned (16))); + short output01[vlen] __attribute__ ((aligned (16))); + short output11[vlen] __attribute__ ((aligned (16))); + short output21[vlen] __attribute__ ((aligned (16))); + short output31[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/volk/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include +#include + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..ba5e8ed93 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short target3[vlen] __attribute__ ((aligned (16))); + + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))) = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + short cntl0[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl1[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + short cntl2[vlen] __attribute__ ((aligned (16))) = { + 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + short cntl3[vlen] __attribute__ ((aligned (16))) = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... %d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include +#include + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc new file mode 100644 index 000000000..7878d4737 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.cc @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_16s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h new file mode 100644 index 000000000..ef813d96f --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..8c3121e5c --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE + +void qa_16s_convert_32f_unaligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_16s_convert_32f_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aeb04f770 --- /dev/null +++ b/volk/lib/qa_16s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc new file mode 100644 index 000000000..734b7784e --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h new file mode 100644 index 000000000..2e409d0cc --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H + +#include +#include + +class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc new file mode 100644 index 000000000..275ab7668 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_convert_8s_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int16_t input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("16s_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h new file mode 100644 index 000000000..4b2fe9e42 --- /dev/null +++ b/volk/lib/qa_16s_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H + +#include +#include + +class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc new file mode 100644 index 000000000..b46b9ae8e --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.cc @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + + + +void qa_16s_max_star_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 6400; + const int ITERS = 100000; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[1] __attribute__ ((aligned (16))); + + short output1[1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < 1; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h new file mode 100644 index 000000000..119f87c4d --- /dev/null +++ b/volk/lib/qa_16s_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H + +#include +#include + +class qa_16s_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc new file mode 100644 index 000000000..4d44735df --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_max_star_horizontal_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + + +void qa_16s_max_star_horizontal_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 32; + const int ITERS = 1; + short input0[vlen] __attribute__ ((aligned (16))); + short output0[vlen>>1] __attribute__ ((aligned (16))); + + short output1[vlen>>1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))); + + short minus0 = ((short) (rand() - (RAND_MAX/2))); + + input0[i] = plus0 - minus0; + + } + printf("16s_max_star_horizontal_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic"); + volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); + /* volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3"); + volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/ + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < (vlen >> 1); ++i) { + // printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + + } + for(int i = 0; i < (vlen >> 1); ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } + } + + +#endif + diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h new file mode 100644 index 000000000..9f9757253 --- /dev/null +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H +#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H + +#include +#include + +class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..3c4f5c6cc --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_environment_init(); + clock_t start, end; + double total; + + short target[vlen] __attribute__ ((aligned (16))); + short target2[vlen] __attribute__ ((aligned (16))); + short src0[vlen] __attribute__ ((aligned (16))); + short permute_indexes[vlen] __attribute__ ((aligned (16))); + short cntl0[vlen] __attribute__ ((aligned (16))); + short cntl1[vlen] __attribute__ ((aligned (16))); + short cntl2[vlen] __attribute__ ((aligned (16))); + short cntl3[vlen] __attribute__ ((aligned (16))); + short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... %d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include +#include + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..80a220c93 --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + short input0[vlen] __attribute__ ((aligned (16))); + short input1[vlen] __attribute__ ((aligned (16))); + short input2[vlen] __attribute__ ((aligned (16))); + short input3[vlen] __attribute__ ((aligned (16))); + + short output0[vlen] __attribute__ ((aligned (16))); + short output1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/volk/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include +#include + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..e700ac72c --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_16s_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_sse21[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse31[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse21[i]); + + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..995ab5b34 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..6ee076998 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_16sc_deinterleave_32f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + float output_sse21[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..fea3b6c2d --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..ca048ea67 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,71 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); + } + printf("16sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + // printf("%d = generic... %d, sse2... %d, ssse3... %d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..ebb70b97a --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..0f4ba6923 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE */ + +#else + +void qa_16sc_deinterleave_real_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); + } + printf("16sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* SSE4_1 */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..e83426473 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..5ab458bc9 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_16sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_16sc_deinterleave_real_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); + } + printf("16sc_deinterleave_real_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + } +} + +#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..04e5511e5 --- /dev/null +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H +#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H + +#include +#include + +class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc new file mode 100644 index 000000000..b14610757 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_16sc_magnitude_16s_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_16sc_magnitude_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse3[vlen] __attribute__ ((aligned (16))); + + int16_t* loadInput = (int16_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); + } + printf("16sc_magnitude_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + } +} + +#endif diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..4664b70f4 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H +#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H + +#include +#include + +class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc new file mode 100644 index 000000000..06dff2fd5 --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_16sc_magnitude_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_16sc_magnitude_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + int16_t* inputLoad = (int16_t*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("16sc_magnitude_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..0c25673ea --- /dev/null +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H +#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H + +#include +#include + +class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc new file mode 100644 index 000000000..6b19828a4 --- /dev/null +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_16u_byteswap_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_16u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint16_t output0[vlen] __attribute__ ((aligned (16))); + uint16_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint16_t)); + + printf("16u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h new file mode 100644 index 000000000..e11b23e3f --- /dev/null +++ b/volk/lib/qa_16u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_16u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc new file mode 100644 index 000000000..ea637d600 --- /dev/null +++ b/volk/lib/qa_32f_accumulator_aligned16.cc @@ -0,0 +1,56 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_accumulator_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_accumulator_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float accumulator_generic; + float accumulator_sse; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_accumulator_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4); +} + +#endif diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h new file mode 100644 index 000000000..0004d3ff0 --- /dev/null +++ b/volk/lib/qa_32f_accumulator_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H +#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H + +#include +#include + +class qa_32f_accumulator_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc new file mode 100644 index 000000000..92f35c7ec --- /dev/null +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_add_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_add_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_add_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h new file mode 100644 index 000000000..58e2a151c --- /dev/null +++ b/volk/lib/qa_32f_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H +#define INCLUDED_QA_32F_ADD_ALIGNED16_H + +#include +#include + +class qa_32f_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc new file mode 100644 index 000000000..3c8137004 --- /dev/null +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_calc_spectral_noise_floor_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_calc_spectral_noise_floor_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[1] __attribute__ ((aligned (16))); + float output01[1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_calc_spectral_noise_floor_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < 1; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h new file mode 100644 index 000000000..c5dce2c4b --- /dev/null +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H +#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H + +#include +#include + +class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc new file mode 100644 index 000000000..84a4c40c4 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_16s_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("%d generic... %d, sse... %d sse2... %d\n", i, output_generic[i], output_sse[i], output_sse2[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h new file mode 100644 index 000000000..fce1eb417 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc new file mode 100644 index 000000000..9469daed2 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_16s_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_16s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_16s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_16s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h new file mode 100644 index 000000000..492bc80e6 --- /dev/null +++ b/volk/lib/qa_32f_convert_16s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc new file mode 100644 index 000000000..ff24c7b0d --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_32s_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_32s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int32_t output_generic[vlen] __attribute__ ((aligned (16))); + int32_t output_sse[vlen] __attribute__ ((aligned (16))); + int32_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_32s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h new file mode 100644 index 000000000..97d854463 --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc new file mode 100644 index 000000000..e63b17994 --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_32s_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_32s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int32_t output_generic[vlen] __attribute__ ((aligned (16))); + int32_t output_sse[vlen] __attribute__ ((aligned (16))); + int32_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_32s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_32s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h new file mode 100644 index 000000000..5d662d86d --- /dev/null +++ b/volk/lib/qa_32f_convert_32s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc new file mode 100644 index 000000000..c546e47de --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_64f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h new file mode 100644 index 000000000..41eb3e094 --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H + +#include +#include + +class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc new file mode 100644 index 000000000..24b51f9af --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_64f_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_64f_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_64f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_64f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h new file mode 100644 index 000000000..4b144f033 --- /dev/null +++ b/volk/lib/qa_32f_convert_64f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc new file mode 100644 index 000000000..a3d4d6567 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_aligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h new file mode 100644 index 000000000..68a523f34 --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H + +#include +#include + +class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc new file mode 100644 index 000000000..d885fd6bb --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32f_convert_8s_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_convert_8s_unaligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_sse[vlen] __attribute__ ((aligned (16))); + int8_t output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_convert_8s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_convert_8s_unaligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); + CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); + } +} + +#endif diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h new file mode 100644 index 000000000..88d4ff42a --- /dev/null +++ b/volk/lib/qa_32f_convert_8s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H +#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H + +#include +#include + +class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc new file mode 100644 index 000000000..b20999beb --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_divide_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_divide_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_divide_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h new file mode 100644 index 000000000..79d5ae4b8 --- /dev/null +++ b/volk/lib/qa_32f_divide_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H +#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H + +#include +#include + +class qa_32f_divide_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc new file mode 100644 index 000000000..98c1f2d99 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.cc @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_aligned16::t1() { + const int vlen = 2046; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_aligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h new file mode 100644 index 000000000..6931a9e98 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc new file mode 100644 index 000000000..8e97d4249 --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.cc @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifndef LV_HAVE_SSE4_1 + +#ifdef LV_HAVE_SSE3 +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); + + for(i = 0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse); + free(result_sse3); + +} +#else +void qa_32f_dot_prod_unaligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + +#else + +void qa_32f_dot_prod_unaligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 4095; + const int ITER = 100000; + + int i; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float * input; + float * taps; + + float * result_generic; + float * result_sse; + float * result_sse3; + float * result_sse4_1; + + ret = posix_memalign((void**)&input, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, (vlen+1) * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); + + input = &input[1]; // Make sure the buffer is unaligned + taps = &taps[1]; // Make sure the buffer is unaligned + + random_floats((float*)input, vlen); + random_floats((float*)taps, vlen); + + printf("32f_dot_prod_unaligned16\n"); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + start = clock(); + for(i = 0; i < ITER; i++){ + get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); + for(i =0; i < ITER; i++){ + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); + } + + free(&input[-1]); + free(&taps[-1]); + free(result_generic); + free(result_sse); + free(result_sse3); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h new file mode 100644 index 000000000..e8bad07fe --- /dev/null +++ b/volk/lib/qa_32f_dot_prod_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H +#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H + +#include +#include + +class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..ca65add28 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/volk/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include +#include + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..a1c3d4cd1 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ b/volk/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc new file mode 100644 index 000000000..2a937637f --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32f_interleave_16sc_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32f_interleave_16sc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex output_generic[vlen] __attribute__ ((aligned (16))); + std::complex output_sse[vlen] __attribute__ ((aligned (16))); + std::complex output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_interleave_16sc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), 1.01); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse2[i]), 1.01); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse2[i]), 1.01); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h new file mode 100644 index 000000000..8d2914817 --- /dev/null +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H + +#include +#include + +class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc new file mode 100644 index 000000000..c22dd1046 --- /dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc @@ -0,0 +1,62 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_interleave_32fc_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_interleave_32fc_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + std::complex output_generic[vlen] __attribute__ ((aligned (16))); + std::complex output_sse[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_interleave_32fc_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), fabs(std::real(output_generic[i]))*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), fabs(std::imag(output_generic[i]))*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h new file mode 100644 index 000000000..cba518d37 --- /dev/null +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H +#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H + +#include +#include + +class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc new file mode 100644 index 000000000..3ef375176 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_max_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h new file mode 100644 index 000000000..d535479f4 --- /dev/null +++ b/volk/lib/qa_32f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_MAX_ALIGNED16_H + +#include +#include + +class qa_32f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc new file mode 100644 index 000000000..617e18b24 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_min_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h new file mode 100644 index 000000000..90961ac92 --- /dev/null +++ b/volk/lib/qa_32f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H +#define INCLUDED_QA_32F_MIN_ALIGNED16_H + +#include +#include + +class qa_32f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc new file mode 100644 index 000000000..c77fe97da --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_multiply_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_multiply_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_multiply_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h new file mode 100644 index 000000000..7032a2ad4 --- /dev/null +++ b/volk/lib/qa_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc new file mode 100644 index 000000000..2954fc3ae --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_normalize_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_normalize_aligned16::t1() { + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 320001; + const int ITERS = 100; + + float* output0; + float* output01; + ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); + ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); + + for(int i = 0; i < vlen; ++i) { + output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(float)); + printf("32f_normalize_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + // printf("%e...%e\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } + + free(output0); + free(output01); +} + +#endif diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h new file mode 100644 index 000000000..7c421eb82 --- /dev/null +++ b/volk/lib/qa_32f_normalize_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H +#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H + +#include +#include + +class qa_32f_normalize_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc new file mode 100644 index 000000000..1b331daeb --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.cc @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32f_power_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + float* input; + int i; + + float* result_generic; + float* result_sse; + float* result_sse4_1; + + ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float)); + + random_floats((float*)input, vlen); + + const float power = 3; + + printf("32f_power_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_power_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + + for(i = 0; i < vlen; i++){ + //printf("%d %e -> %e %e %e\n", i, input[i], result_generic[i], result_sse[i], result_sse4_1[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA); + CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32f_power_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h new file mode 100644 index 000000000..d45df4e56 --- /dev/null +++ b/volk/lib/qa_32f_power_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H +#define INCLUDED_QA_32F_POWER_ALIGNED16_H + +#include +#include + +class qa_32f_power_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_power_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc new file mode 100644 index 000000000..a3e6abc18 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_sqrt_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_sqrt_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + // No reason to test negative numbers because they result in NaN. + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); + } + printf("32f_sqrt_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h new file mode 100644 index 000000000..e4b99d981 --- /dev/null +++ b/volk/lib/qa_32f_sqrt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H +#define INCLUDED_QA_32F_SQRT_ALIGNED16_H + +#include +#include + +class qa_32f_sqrt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc new file mode 100644 index 000000000..c0f22cdea --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.cc @@ -0,0 +1,74 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_stddev_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_stddev_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean = 0; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + mean += input0[i]; + } + mean /= static_cast(vlen); + + printf("32f_stddev_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h new file mode 100644 index 000000000..7f8d7a5fc --- /dev/null +++ b/volk/lib/qa_32f_stddev_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H + +#include +#include + +class qa_32f_stddev_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc new file mode 100644 index 000000000..dcad8bcf3 --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_stddev_and_mean_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_stddev_and_mean_aligned16::t1() { + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + + float stddev_generic; + float stddev_sse; + float stddev_sse4_1; + float mean_generic; + float mean_sse; + float mean_sse4_1; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_stddev_and_mean_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4); + +} + +#endif diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h new file mode 100644 index 000000000..e08bd249a --- /dev/null +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H +#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H + +#include +#include + +class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc new file mode 100644 index 000000000..a7e1b5ae3 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_subtract_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32f_subtract_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_subtract_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h new file mode 100644 index 000000000..97c14f129 --- /dev/null +++ b/volk/lib/qa_32f_subtract_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H +#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H + +#include +#include + +class qa_32f_subtract_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc new file mode 100644 index 000000000..494776357 --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.cc @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include + +#define SNR 30.0 +#define CENTER -4.0 +#define CUTOFF -5.595 +#define ERR_DELTA (1e-4) +#define NUM_ITERS 100000 +#define VEC_LEN 64 +static float uniform() { + return ((float) rand() / RAND_MAX); // uniformly (0, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * -SNR/2.0; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32f_sum_of_poly_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32f_sum_of_poly_aligned16::t1(){ + int i = 0; + + volk_environment_init(); + int ret; + + const int vlen = VEC_LEN; + float cutoff = CUTOFF; + + float* center_point_array; + float* target; + float* target_generic; + float* src0 ; + + + ret = posix_memalign((void**)¢er_point_array, 16, 24); + ret = posix_memalign((void**)&target, 16, 4); + ret = posix_memalign((void**)&target_generic, 16, 4); + ret = posix_memalign((void**)&src0, 16, (vlen << 2)); + + + random_floats((float*)src0, vlen); + + float a = (float)CENTER; + float etoa = expf(a); + center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 + + (-4.0 * a * a * a)/24.0 + + (3.0 * a * a)/6.0 + + (-2.0 * a)/2.0 + + (1.0)) * etoa; + center_point_array[1] = (//(-10.0 * a * a * a)/120.0 + + (6.0 * a * a)/24.0 + + (-3.0 * a)/6.0 + + (1.0/2.0)) * etoa; + center_point_array[2] = (//(10.0 * a * a)/120.0 + + (-4.0 * a)/24.0 + + (1.0/6.0)) * etoa; + center_point_array[3] = (//(-5.0 * a)/120.0 + + (1.0/24.0)) * etoa; + //center_point_array[4] = ((1.0)/120.0) * etoa; + center_point_array[4] = (//(a * a * a * a * a)/120.0 + + (a * a * a * a)/24.0 + + (a * a * a)/-6.0 + + (a * a)/2.0 + + -a + 1.0) * etoa; + + printf("32f_sum_of_poly_aligned16\n"); + + clock_t start, end; + double total; + + float my_sum = 0.0; + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + float sum = 0.0; + for(int l = 0; l < vlen; ++l) { + + sum += expf(src0[l]); + + } + my_sum = sum; + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("exp time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + + volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic"); + + } + + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32f_sum_of_poly_aligned16_manual(target, src0, center_point_array, &cutoff, vlen << 2, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 approx time: %f\n", total); + + + + printf("exp: %f, sse3: %f\n", my_sum, target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA); + + + free(center_point_array); + free(target); + free(target_generic); + free(src0); + + +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h new file mode 100644 index 000000000..67a347f9a --- /dev/null +++ b/volk/lib/qa_32f_sum_of_poly_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H +#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H + +#include +#include + +class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc new file mode 100644 index 000000000..4eba0a3cd --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_32f_multiply_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + float * taps; + int i; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats(taps, vlen); + + printf("32fc_32f_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_32f_multiply_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ + diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h new file mode 100644 index 000000000..fc3b3eeb2 --- /dev/null +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc new file mode 100644 index 000000000..64ea65da9 --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1.5e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE +void qa_32fc_32f_power_32fc_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 10000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + int i; + + std::complex* result_generic; + std::complex* result_sse; + + ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); + ret = posix_memalign((void**)&result_sse, 16, vlen * 2 * sizeof(float)); + + random_floats((float*)input, vlen * 2); + + const float power = 3.2; + + printf("32fc_32f_power_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA); + } + + free(input); + free(result_generic); + free(result_sse); + +} +#else +void qa_32fc_32f_power_32fc_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE */ + diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h new file mode 100644 index 000000000..464b7b7cc --- /dev/null +++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H +#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H + +#include +#include + +class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc new file mode 100644 index 000000000..a24382d71 --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_atan2_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32fc_atan2_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_atan2_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h new file mode 100644 index 000000000..9c4dc209a --- /dev/null +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc new file mode 100644 index 000000000..497914e0a --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc @@ -0,0 +1,137 @@ +#include +#include +#include +#include + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_conjugate_dot_prod_aligned16\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_conjugate_dot_prod_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h new file mode 100644 index 000000000..507b1769b --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..0f5a030f5 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..78660e6ad --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc new file mode 100644 index 000000000..6e051afbc --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_64f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_generic1[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + double output_sse21[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h new file mode 100644 index 000000000..f924b9752 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..850518524 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..68b80f27d --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..321deb184 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..765450bb6 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc new file mode 100644 index 000000000..aedb2e387 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32fc_deinterleave_real_64f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + double output_generic[vlen] __attribute__ ((aligned (16))); + double output_sse2[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_deinterleave_real_64f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h new file mode 100644 index 000000000..3e55fb812 --- /dev/null +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H +#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H + +#include +#include + +class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc new file mode 100644 index 000000000..bcf9ea954 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.cc @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + + + +#if LV_HAVE_SSE3 +void qa_32fc_dot_prod_aligned16::t1() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse3"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +void qa_32fc_dot_prod_aligned16::t2() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_32"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_32_time: %f\n", total); + + printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t2() { + printf("sse_32 not available... no test performed\n"); +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_64 + +void qa_32fc_dot_prod_aligned16::t3() { + + const int vlen = 2046; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result_sse3, 16, 8); + + + result_generic[0] = std::complex(0,0); + result_sse3[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_dot_prod_aligned16\n"); + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_64"); + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_64_time: %f\n", total); + + printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); + + + assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} + +#else +void qa_32fc_dot_prod_aligned16::t3() { + printf("sse_64 not available... no test performed\n"); +} + + + +#endif diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h new file mode 100644 index 000000000..4b360db27 --- /dev/null +++ b/volk/lib/qa_32fc_dot_prod_aligned16.h @@ -0,0 +1,20 @@ +#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H +#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H + +#include +#include + +class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); + void t2 (); + void t3 (); +}; + + +#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..4d83f1639 --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/volk/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc new file mode 100644 index 000000000..a4be1616b --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_16s_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_magnitude_16s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse[vlen] __attribute__ ((aligned (16))); + int16_t output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_magnitude_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h new file mode 100644 index 000000000..ffdf1dd9e --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H + +#include +#include + +class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc new file mode 100644 index 000000000..d69ada408 --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -0,0 +1,70 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_magnitude_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_magnitude_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_magnitude_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + } +} + +#endif diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h new file mode 100644 index 000000000..a2881308c --- /dev/null +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc new file mode 100644 index 000000000..e1f7eab3d --- /dev/null +++ b/volk/lib/qa_32fc_multiply_aligned16.cc @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-3) + +//test for sse +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +#ifdef LV_HAVE_SSE3 +void qa_32fc_multiply_aligned16::t1() { + + const int vlen = 2046; + const int ITERS = 100000; + + int i; + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse3; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + printf("32fc_multiply_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse3); + +} +#else +void qa_32fc_multiply_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#endif /* LV_HAVE_SSE3 */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.h b/volk/lib/qa_32fc_multiply_aligned16.h new file mode 100644 index 000000000..c8abaa8fe --- /dev/null +++ b/volk/lib/qa_32fc_multiply_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H +#define INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H + +#include +#include + +class qa_32fc_multiply_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc new file mode 100644 index 000000000..83cdf4b15 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc new file mode 100644 index 000000000..4d1359068 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_power_spectrum_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse3[vlen] __attribute__ ((aligned (16))); + + const float scalar = vlen; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + + printf("32fc_power_spectrum_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse33... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h new file mode 100644 index 000000000..d991223f3 --- /dev/null +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc new file mode 100644 index 000000000..d9ead8495 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.cc @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 10000000 +#define VEC_LEN 64 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex* src0 ; + std::complex* points; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + + printf("32fc_square_dist_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(; i < vlen; ++i) { + //printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA); + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h new file mode 100644 index 000000000..9d365d8b0 --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H +#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H + +#include +#include + +class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_square_dist_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc new file mode 100644 index 000000000..f923d1d5c --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA .0001 +#define NUM_ITERS 10000000 +#define VEC_LEN 64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + printf("sse3 not available... no test performed\n"); +} + +#else + + +void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ + int i = 0; + + const int vlen = VEC_LEN; + + volk_environment_init(); + int ret; + + float* target; + float* target_generic; + std::complex* src0 ; + std::complex* points; + float scalar; + + ret = posix_memalign((void**)&points, 16, vlen << 3); + ret = posix_memalign((void**)&target, 16, vlen << 2); + ret = posix_memalign((void**)&target_generic, 16, vlen << 2); + ret = posix_memalign((void**)&src0, 16, 8); + + random_floats((float*)points, vlen * 2); + random_floats((float*)src0, 2); + random_floats(&scalar, 1); + + printf("32fc_square_dist_scalar_mult_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + for(i = 0; i < vlen; ++i) { + printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[1]) * ERR_DELTA);//, target_generic[1] * ERR_DELTA); + } + + free(target); + free(target_generic); + free(points); + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h new file mode 100644 index 000000000..ac4e3c45b --- /dev/null +++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H +#define INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H + +#include +#include + +class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc new file mode 100644 index 000000000..72d05cf6f --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_and_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32s_and_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_and_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h new file mode 100644 index 000000000..dfcb47c63 --- /dev/null +++ b/volk/lib/qa_32s_and_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H +#define INCLUDED_QA_32S_AND_ALIGNED16_H + +#include +#include + +class qa_32s_and_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_and_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc new file mode 100644 index 000000000..eab3fe016 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32s_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h new file mode 100644 index 000000000..efd2a2eea --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..0e504cfa1 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_32s_convert_32f_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32s_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + int32_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); + } + printf("32s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h new file mode 100644 index 000000000..5006f5fd8 --- /dev/null +++ b/volk/lib/qa_32s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc new file mode 100644 index 000000000..e09dfb91c --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32s_or_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_32s_or_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int32_t input0[vlen] __attribute__ ((aligned (16))); + int32_t input1[vlen] __attribute__ ((aligned (16))); + + int32_t output0[vlen] __attribute__ ((aligned (16))); + int32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); + input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); + } + printf("32s_or_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h new file mode 100644 index 000000000..9e949eb52 --- /dev/null +++ b/volk/lib/qa_32s_or_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H +#define INCLUDED_QA_32S_OR_ALIGNED16_H + +#include +#include + +class qa_32s_or_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32s_or_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc new file mode 100644 index 000000000..8b1023876 --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_32u_byteswap_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_32u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint32_t output0[vlen] __attribute__ ((aligned (16))); + uint32_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint32_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint32_t)); + printf("32u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h new file mode 100644 index 000000000..47bad4c3d --- /dev/null +++ b/volk/lib/qa_32u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_32u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc new file mode 100644 index 000000000..49fcddeb2 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_32u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_32u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint32_t input0 __attribute__ ((aligned (16))); + + uint32_t output0 __attribute__ ((aligned (16))); + uint32_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint32_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("32u_popcnt_aligned\n"); + + start = clock(); + uint32_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_32u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h new file mode 100644 index 000000000..fa1dc1041 --- /dev/null +++ b/volk/lib/qa_32u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H + +#include +#include + +class qa_32u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc new file mode 100644 index 000000000..0eaebf00a --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_convert_32f_aligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h new file mode 100644 index 000000000..95d79f73d --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc new file mode 100644 index 000000000..dcf94bd27 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_64f_convert_32f_unaligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_convert_32f_unaligned16::t1() { + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + + double input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse2[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h new file mode 100644 index 000000000..430327e81 --- /dev/null +++ b/volk/lib/qa_64f_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc new file mode 100644 index 000000000..41ab078b0 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_max_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_max_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_max_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h new file mode 100644 index 000000000..7cbd4d4c1 --- /dev/null +++ b/volk/lib/qa_64f_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H +#define INCLUDED_QA_64F_MAX_ALIGNED16_H + +#include +#include + +class qa_64f_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc new file mode 100644 index 000000000..b4664d065 --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64f_min_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64f_min_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + double input0[vlen] __attribute__ ((aligned (16))); + double input1[vlen] __attribute__ ((aligned (16))); + + double output0[vlen] __attribute__ ((aligned (16))); + double output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("64f_min_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h new file mode 100644 index 000000000..a0e95395f --- /dev/null +++ b/volk/lib/qa_64f_min_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H +#define INCLUDED_QA_64F_MIN_ALIGNED16_H + +#include +#include + +class qa_64f_min_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64f_min_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc new file mode 100644 index 000000000..4f5d4d02b --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.cc @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE2 + +void qa_64u_byteswap_aligned16::t1() { + printf("sse2 not available... no test performed\n"); +} + +#else + +void qa_64u_byteswap_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100001; + + uint64_t output0[vlen] __attribute__ ((aligned (16))); + uint64_t output01[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + output0[i] = (uint64_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); + } + memcpy(output01, output0, vlen*sizeof(uint64_t)); + printf("64u_byteswap_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + } +} + +#endif diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h new file mode 100644 index 000000000..a4fa0c983 --- /dev/null +++ b/volk/lib/qa_64u_byteswap_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H +#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H + +#include +#include + +class qa_64u_byteswap_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc new file mode 100644 index 000000000..bce9ff6c2 --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_2 + +void qa_64u_popcnt_aligned16::t1() { + printf("sse4.2 not available... no test performed\n"); +} + +#else + +void qa_64u_popcnt_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + + const int ITERS = 10000000; + uint64_t input0 __attribute__ ((aligned (16))); + + uint64_t output0 __attribute__ ((aligned (16))); + uint64_t output01 __attribute__ ((aligned (16))); + + input0 = ((uint64_t) (rand() - (RAND_MAX/2))); + output0 = 0; + output01 = 0; + + printf("64u_popcnt_aligned\n"); + + start = clock(); + uint64_t ret = 0; + for(int count = 0; count < ITERS; ++count) { + volk_64u_popcnt_aligned16_manual(&ret, input0, "generic"); + output0 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + ret = 0; + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0); + output01 += ret; + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.2_time: %f\n", total); + + + CPPUNIT_ASSERT_EQUAL(output0, output01); +} + +#endif diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h new file mode 100644 index 000000000..217822d6e --- /dev/null +++ b/volk/lib/qa_64u_popcnt_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H +#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H + +#include +#include + +class qa_64u_popcnt_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc new file mode 100644 index 000000000..35f08fb81 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_16s_aligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_16s_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h new file mode 100644 index 000000000..38739fc96 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H + +#include +#include + +class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc new file mode 100644 index 000000000..bb326f818 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4_1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_16s_unaligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_16s_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_16s_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_16s_unaligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h new file mode 100644 index 000000000..d39fffc35 --- /dev/null +++ b/volk/lib/qa_8s_convert_16s_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H + +#include +#include + +class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc new file mode 100644 index 000000000..522da0b9d --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_aligned16::t1() { + printf("sse4_1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_32f_aligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h new file mode 100644 index 000000000..7f8401d42 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H + +#include +#include + +class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc new file mode 100644 index 000000000..ea1fb7c74 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +//test for sse4.1 + +#ifndef LV_HAVE_SSE4_1 + +void qa_8s_convert_32f_unaligned16::t1() { + printf("sse4_1 not available... no test performed\n"); +} + +#else + +void qa_8s_convert_32f_unaligned16::t1() { + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + int8_t input0[vlen+1] __attribute__ ((aligned (16))); + + float output_generic[vlen+1] __attribute__ ((aligned (16))); + float output_sse4_1[vlen+1] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); + } + printf("8s_convert_32f_unaligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%e...%e\n", output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h new file mode 100644 index 000000000..aad2f8c22 --- /dev/null +++ b/volk/lib/qa_8s_convert_32f_unaligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H +#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H + +#include +#include + +class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc new file mode 100644 index 000000000..823e7fe2e --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_16s_aligned16::t1() { + printf("sse4_1 not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_generic1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_11[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse4_11[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h new file mode 100644 index 000000000..9c99fed70 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc new file mode 100644 index 000000000..fb580516c --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_generic1[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + float output_sse1[vlen] __attribute__ ((aligned (16))); + float output_sse4_1[vlen] __attribute__ ((aligned (16))); + float output_sse14_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < vlen; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i],std::max((output_generic[i])*1e-4, 1e-4)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max((output_generic[i])*1e-4, 1e-4)); + + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max((output_generic[i])*1e-4, 1e-4)); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max((output_generic[i])*1e-4, 1e-4)); + } +} + + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h new file mode 100644 index 000000000..63b5fdadb --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc new file mode 100644 index 000000000..1cc844b52 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + printf("sse4_1 not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_16s_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_16s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h new file mode 100644 index 000000000..02050926f --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc new file mode 100644 index 000000000..10e537cde --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE4_1 + +#ifndef LV_HAVE_SSE + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + printf("sse not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_sse[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + } +} + +#endif /* LV_HAVE_SSE */ + +#else + +void qa_8sc_deinterleave_real_32f_aligned16::t1() { + + + volk_runtime_init(); + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex *input0; + + float* output_generic; + float* output_sse; + float* output_sse4_1; + + ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t)); + ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float)); + ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float)); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0); + } + + printf("8sc_deinterleave_real_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 1288.0, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); + } + + free(input0); + free(output_generic); + free(output_sse); + free(output_sse4_1); +} + +#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h new file mode 100644 index 000000000..93338e488 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc new file mode 100644 index 000000000..d84df8119 --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSSE3 + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + printf("ssse3 not available... no test performed\n"); +} + +#else + +void qa_8sc_deinterleave_real_8s_aligned16::t1() { + + volk_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 100000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + int8_t output_generic[vlen] __attribute__ ((aligned (16))); + int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + + int8_t* loadInput = (int8_t*)input0; + for(int i = 0; i < vlen*2; ++i) { + loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + printf("8sc_deinterleave_real_8s_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("ssse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + } +} + +#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h new file mode 100644 index 000000000..92fc0dd4a --- /dev/null +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H +#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H + +#include +#include + +class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc new file mode 100644 index 000000000..d64eac8ce --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t)); + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_16sc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex*)result_generic, (std::complex*)input, (std::complex*)taps, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex*)result_sse4_1, (std::complex*)input, (std::complex*)taps, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); + + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h new file mode 100644 index 000000000..0e78a5eca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H + +#include +#include + +class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc new file mode 100644 index 000000000..c27f0e0ca --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +#ifndef LV_HAVE_SSE4_1 + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + printf("sse4.1 not available... no test performed\n"); +} + +#else + +void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { + + + volk_runtime_init(); + + const int vlen = 2046; + const int ITERS = 100000; + + volk_environment_init(); + int ret; + clock_t start, end; + double total; + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result_sse4_1; + int i; + int8_t* inputInt8_T; + int8_t* tapsInt8_T; + + ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); + ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float)); + + + inputInt8_T = (int8_t*)input; + tapsInt8_T = (int8_t*)taps; + for(int i = 0; i < vlen*2; ++i) { + inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); + } + + printf("8sc_multiply_conjugate_32fc_aligned16\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4_1_time: %f\n", total); + + for(i = 0; i < vlen; i++){ + //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); + assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); + } + + free(input); + free(taps); + free(result_generic); + free(result_sse4_1); + +} + +#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h new file mode 100644 index 000000000..eb9ae309c --- /dev/null +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H +#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H + +#include +#include + +class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc new file mode 100644 index 000000000..c3c27b69b --- /dev/null +++ b/volk/lib/qa_volk.cc @@ -0,0 +1,211 @@ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +/* + * This class gathers together all the test cases for the example + * directory into a single test suite. As you create new test cases, + * add them here. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +CppUnit::TestSuite * +qa_volk::suite() +{ + CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); + + s->addTest(qa_16s_quad_max_star_aligned16::suite()); + s->addTest(qa_32fc_dot_prod_aligned16::suite()); + s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); + s->addTest(qa_32fc_square_dist_aligned16::suite()); + s->addTest(qa_32f_sum_of_poly_aligned16::suite()); + s->addTest(qa_32fc_index_max_aligned16::suite()); + s->addTest(qa_32f_index_max_aligned16::suite()); + s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite()); + s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite()); + s->addTest(qa_16s_branch_4_state_8_aligned16::suite()); + s->addTest(qa_16s_max_star_horizontal_aligned16::suite()); + s->addTest(qa_16s_max_star_aligned16::suite()); + s->addTest(qa_16s_add_quad_aligned16::suite()); + s->addTest(qa_32f_add_aligned16::suite()); + s->addTest(qa_32f_subtract_aligned16::suite()); + s->addTest(qa_32f_max_aligned16::suite()); + s->addTest(qa_32f_min_aligned16::suite()); + s->addTest(qa_64f_max_aligned16::suite()); + s->addTest(qa_64f_min_aligned16::suite()); + s->addTest(qa_32s_and_aligned16::suite()); + s->addTest(qa_32s_or_aligned16::suite()); + s->addTest(qa_32f_dot_prod_aligned16::suite()); + s->addTest(qa_32f_dot_prod_unaligned16::suite()); + s->addTest(qa_32f_fm_detect_aligned16::suite()); + s->addTest(qa_32fc_32f_multiply_aligned16::suite()); + s->addTest(qa_32fc_multiply_aligned16::suite()); + s->addTest(qa_32f_divide_aligned16::suite()); + s->addTest(qa_32f_multiply_aligned16::suite()); + s->addTest(qa_32f_sqrt_aligned16::suite()); + s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite()); + s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite()); + s->addTest(qa_32u_popcnt_aligned16::suite()); + s->addTest(qa_64u_popcnt_aligned16::suite()); + s->addTest(qa_16u_byteswap_aligned16::suite()); + s->addTest(qa_32u_byteswap_aligned16::suite()); + s->addTest(qa_64u_byteswap_aligned16::suite()); + s->addTest(qa_32f_normalize_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_16s_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite()); + s->addTest(qa_16sc_magnitude_16s_aligned16::suite()); + s->addTest(qa_16sc_magnitude_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_64f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite()); + s->addTest(qa_32fc_magnitude_16s_aligned16::suite()); + s->addTest(qa_32fc_magnitude_32f_aligned16::suite()); + s->addTest(qa_32f_interleave_16sc_aligned16::suite()); + s->addTest(qa_32f_interleave_32fc_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_16s_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_32f_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite()); + s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite()); + s->addTest(qa_16s_convert_32f_aligned16::suite()); + s->addTest(qa_16s_convert_32f_unaligned16::suite()); + s->addTest(qa_16s_convert_8s_aligned16::suite()); + s->addTest(qa_16s_convert_8s_unaligned16::suite()); + s->addTest(qa_32f_convert_16s_aligned16::suite()); + s->addTest(qa_32f_convert_16s_unaligned16::suite()); + s->addTest(qa_32f_convert_32s_aligned16::suite()); + s->addTest(qa_32f_convert_32s_unaligned16::suite()); + s->addTest(qa_32f_convert_64f_aligned16::suite()); + s->addTest(qa_32f_convert_64f_unaligned16::suite()); + s->addTest(qa_32f_convert_8s_aligned16::suite()); + s->addTest(qa_32f_convert_8s_unaligned16::suite()); + s->addTest(qa_32s_convert_32f_aligned16::suite()); + s->addTest(qa_32s_convert_32f_unaligned16::suite()); + s->addTest(qa_64f_convert_32f_aligned16::suite()); + s->addTest(qa_64f_convert_32f_unaligned16::suite()); + s->addTest(qa_8s_convert_16s_aligned16::suite()); + s->addTest(qa_8s_convert_16s_unaligned16::suite()); + s->addTest(qa_8s_convert_32f_aligned16::suite()); + s->addTest(qa_8s_convert_32f_unaligned16::suite()); + s->addTest(qa_32fc_32f_power_32fc_aligned16::suite()); + s->addTest(qa_32f_power_aligned16::suite()); + s->addTest(qa_32fc_atan2_32f_aligned16::suite()); + s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite()); + s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite()); + s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite()); + s->addTest(qa_32f_accumulator_aligned16::suite()); + s->addTest(qa_32f_stddev_aligned16::suite()); + s->addTest(qa_32f_stddev_and_mean_aligned16::suite()); + + return s; +} diff --git a/volk/lib/qa_volk.h b/volk/lib/qa_volk.h new file mode 100644 index 000000000..43fa7faba --- /dev/null +++ b/volk/lib/qa_volk.h @@ -0,0 +1,36 @@ +/* -*- c++ -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU Example Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Example Public License for more details. + * + * You should have received a copy of the GNU Example Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_QA_VOLK_H +#define INCLUDED_QA_VOLK_H + +#include + +//! collect all the tests for the example directory + +class qa_volk { + public: + //! return suite of tests for all of example directory + static CppUnit::TestSuite *suite (); +}; + +#endif /* INCLUDED_QA_VOLK_H */ diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc new file mode 100644 index 000000000..50ac08eab --- /dev/null +++ b/volk/lib/test_all.cc @@ -0,0 +1,82 @@ +/* -*- c++ -*- */ +/* + * Copyright 2002,2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +int +main (int argc, char **argv) +{ + + int opt = 0; + std::string xmlOutputFile(""); + + while( (opt = getopt(argc, argv, "o:")) != -1){ + switch(opt){ + case 'o': + if(optarg){ + xmlOutputFile.assign(optarg); + } + else{ + std::cerr << "No xml file output specified for -o" << std::endl; + exit(EXIT_FAILURE); + } + break; + + default: /* '?' */ + fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n", + argv[0]); + exit(EXIT_FAILURE); + } + + } + + CppUnit::TextUi::TestRunner runner; + + runner.addTest (qa_volk::suite ()); + + bool was_successful = false; + if(!xmlOutputFile.empty()){ + std::ofstream xmlOutput(xmlOutputFile.c_str()); + if(xmlOutput.is_open()){ + runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput)); + + was_successful = runner.run("", false, true, false); + } + xmlOutput.close(); + } + else{ + was_successful = runner.run ("", false); + } + + return was_successful ? 0 : 1; +} diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c new file mode 100644 index 000000000..b1a93db26 --- /dev/null +++ b/volk/lib/volk_rank_archs.c @@ -0,0 +1,13 @@ +#include +#include + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) { + int i = 2; + unsigned int best_val = 0; + for(; i < arch_defs[0] + 1; ++i) { + if((arch_defs[i]&(!arch)) == 0) { + best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val; + } + } + return best_val; +} diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h new file mode 100644 index 000000000..26b9f7503 --- /dev/null +++ b/volk/lib/volk_rank_archs.h @@ -0,0 +1,14 @@ +#ifndef INCLUDED_VOLK_RANK_ARCHS_H +#define INCLUDED_VOLK_RANK_ARCHS_H + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch); + + +#ifdef __cplusplus +} +#endif +#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ diff --git a/volk/libvector_replace.sh b/volk/libvector_replace.sh new file mode 100755 index 000000000..4c7e33e1b --- /dev/null +++ b/volk/libvector_replace.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +cd $1 +files=`ls` +for file in $files +do + sed 's/libvector/volk/g' < $file > tempfile + sed 's/LIBVECTOR/VOLK/g' < tempfile > $file +done +for file in $files +do + echo $file > tempfile + newfile=`sed 's/libvector/volk/g' < tempfile` + if (test "$file" != "$newfile"); then + mv $file $newfile + echo "changed $file to $newfile" + fi +done diff --git a/volk/python/Makefile.am b/volk/python/Makefile.am new file mode 100644 index 000000000..8485c4793 --- /dev/null +++ b/volk/python/Makefile.am @@ -0,0 +1,95 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(PYTHON_CPPFLAGS) +SWIGPYTHONARGS = $(SWIGPYTHONFLAGS) $(SWIGGRFLAGS) + +BUILT_SOURCES = $(swig_built_sources) + +MOSTLYCLEANFILES = $(BUILT_SOURCES) *.pyc + +# Don't distribute the output of swig +dist-hook: + @for file in $(swig_built_sources); do echo $(RM) $(distdir)/$$file; done + @for file in $(swig_built_sources); do $(RM) $(distdir)/$$file; done + + +# if we're not using python, ignore the rest... + +TESTS = + + +ALL_IFILES = \ + $(LOCAL_IFILES) \ + $(NON_LOCAL_IFILES) + +NON_LOCAL_IFILES = \ + $(GNURADIO_CORE_INCLUDEDIR)/swig/gnuradio.i + +LOCAL_IFILES = \ + libvector.i \ + libvector_square_ff.i + + +# These files are built by SWIG. The first is the C++ glue. +# The second is the python wrapper that loads the _libvector shared library +# and knows how to call our extensions. + +swig_built_sources = \ + libvector_swig.cc \ + libvector_swig.py + +# This gets libvector.py installed in the right place. +# Any hand-written python should be listed here too. +ourpython_PYTHON = \ + __init__.py \ + libvector_swig.py + +# Python QA code gets listed here. It's not installed. +noinst_PYTHON = + +# This library contains the swig generated glue +ourpyexec_LTLIBRARIES = _libvector_swig.la + +# These are the source files that go into the shared library +_libvector_swig_la_SOURCES = \ + libvector_swig.cc + +# Dummy C++ source to cause C++ linking +nodist_EXTRA__libvector_swig_la_SOURCES = dummy.cc + +# magic flags +_libvector_swig_la_LDFLAGS = $(NO_UNDEFINED) -module -avoid-version + +# Link the swig generated code against our library and some python magic. +_libvector_swig_la_LIBADD = \ + ../lib/liblibvector.la \ + $(PYTHON_LDFLAGS) + + +libvector_swig.cc libvector_swig.py: $(LOCAL_IFILES) $(ALL_IFILES) + $(SWIG) $(SWIGPYTHONARGS) -module libvector_swig -o libvector_swig.cc $(srcdir)/libvector.i + + +# These swig headers get installed in ${prefix}/include//swig +ourswiginclude_HEADERS = \ + $(LOCAL_IFILES) diff --git a/volk/python/__init__.py b/volk/python/__init__.py new file mode 100644 index 000000000..6513a31dd --- /dev/null +++ b/volk/python/__init__.py @@ -0,0 +1,53 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +# The presence of this file turns this directory into a Python package + +# ---------------------------------------------------------------- +# Temporary workaround for ticket:181 (swig+python problem) +import sys +_RTLD_GLOBAL = 0 +try: + from dl import RTLD_GLOBAL as _RTLD_GLOBAL +except ImportError: + try: + from DLFCN import RTLD_GLOBAL as _RTLD_GLOBAL + except ImportError: + pass + +if _RTLD_GLOBAL != 0: + _dlopenflags = sys.getdlopenflags() + sys.setdlopenflags(_dlopenflags|_RTLD_GLOBAL) +# ---------------------------------------------------------------- + + +# import swig generated symbols into the libvector namespace +from libvector_swig import * + +# import any pure python here +# from libvector_foo import bar +# from libvector_baz import * + + +# ---------------------------------------------------------------- +# Tail of workaround +if _RTLD_GLOBAL != 0: + sys.setdlopenflags(_dlopenflags) # Restore original flags +# ---------------------------------------------------------------- diff --git a/volk/python/libvector.i b/volk/python/libvector.i new file mode 100644 index 000000000..598c20a2a --- /dev/null +++ b/volk/python/libvector.i @@ -0,0 +1,47 @@ +/* -*- c++ -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +%feature("autodoc", "1"); // generate python docstrings + +%include "exception.i" +%import "gnuradio.i" // the common stuff + +%{ +#include "gnuradio_swig_bug_workaround.h" // mandatory bug fix +#include +%} + +// ---------------------------------------------------------------- + +/* + * Gather all .i files in this directory together. + */ + +%{ + +// The .h files +#include + +%} + +// The .i files +%include + diff --git a/volk/python/libvector_square_ff.i b/volk/python/libvector_square_ff.i new file mode 100644 index 000000000..a1c547cc7 --- /dev/null +++ b/volk/python/libvector_square_ff.i @@ -0,0 +1,37 @@ +/* -*- c++ -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * First arg is the package prefix. + * Second arg is the name of the class minus the prefix. + * + * This does some behind-the-scenes magic so we can + * access gr_example_square_ff from python as howto.square_ff + */ +GR_SWIG_BLOCK_MAGIC(libvector,square_ff); + +libvector_square_ff_sptr libvector_make_square_ff (); + +class libvector_square_ff : public gr_sync_block +{ +private: + libvector_square_ff(); +}; diff --git a/volk/python/qa_square.py b/volk/python/qa_square.py new file mode 100755 index 000000000..1ee56411f --- /dev/null +++ b/volk/python/qa_square.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# +# Copyright 2004,2007,2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +from gnuradio import gr, gr_unittest +import libvector_swig as libvector + +class qa_libvector(gr_unittest.TestCase): + + def setUp(self): + self.tb = gr.top_block() + + def tearDown(self): + self.tb = None + + def test_001_square_ff(self): + src_data = (-3, 4, -5.5, 2, 3) + expected_result = (9, 16, 30.25, 4, 9) + src = gr.vector_source_f(src_data) + sqr = libvector.square_ff() + dst = gr.vector_sink_f() + self.tb.connect(src, sqr) + self.tb.connect(sqr, dst) + self.tb.run() + result_data = dst.data() + self.assertFloatTuplesAlmostEqual(expected_result, result_data, 6) + +if __name__ == '__main__': + gr_unittest.main() diff --git a/volk/python/run_tests.in b/volk/python/run_tests.in new file mode 100644 index 000000000..895a849bd --- /dev/null +++ b/volk/python/run_tests.in @@ -0,0 +1,50 @@ +#!/bin/sh + +# All this strange PYTHONPATH manipulation is required to run our +# tests using our just built shared library and swig-generated python +# code prior to installation. + +# build tree == src tree unless you're doing a VPATH build. +# If you don't know what a VPATH build is, you're not doing one. Relax... + +prefix=@prefix@ +exec_prefix=@exec_prefix@ + +# Where to look in the build tree for our shared library +libbld=@abs_top_builddir@/python +# Where to look in the src tree for swig generated python code +libsrc=@abs_top_builddir@/python +# Where to look in the src tree for hand written python code +py=@abs_top_srcdir@/python + +# Where to look for installed GNU Radio python modules +# FIXME this is wrong on a distcheck. We really need to ask gnuradio-core +# where it put its python files. +installed_pythondir=@pythondir@ +installed_pyexecdir=@pyexecdir@ + +PYTHONPATH="$libbld:$libbld/.libs:$libsrc:$py:$installed_pythondir:$installed_pyexecdir:$PYTHONPATH" +#PYTHONPATH="$libbld:$libbld/.libs:$libsrc:$py:$installed_pythondir:$installed_pyexecdir" + +export PYTHONPATH + +# +# This is the simple part... +# Run everything that matches qa_*.py and return the final result. +# + +ok=yes +for file in @srcdir@/qa_*.py +do + if ! $file + then + ok=no + fi +done + +if [ $ok = yes ] +then + exit 0 +else + exit 1 +fi diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h new file mode 100644 index 000000000..8e3e3f2a6 --- /dev/null +++ b/volk/spu_lib/gc_spu_macs.h @@ -0,0 +1,380 @@ +/* -*- asm -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef INCLUDED_GC_SPU_MACS_H +#define INCLUDED_GC_SPU_MACS_H + +/* + * This file contains a set of macros that are generally useful when + * coding in SPU assembler + * + * Note that the multi-instruction macros in here may overwrite + * registers 77, 78, and 79 without warning. + */ + +/* + * defines for all registers + */ +#define r0 $0 +#define r1 $1 +#define r2 $2 +#define r3 $3 +#define r4 $4 +#define r5 $5 +#define r6 $6 +#define r7 $7 +#define r8 $8 +#define r9 $9 +#define r10 $10 +#define r11 $11 +#define r12 $12 +#define r13 $13 +#define r14 $14 +#define r15 $15 +#define r16 $16 +#define r17 $17 +#define r18 $18 +#define r19 $19 +#define r20 $20 +#define r21 $21 +#define r22 $22 +#define r23 $23 +#define r24 $24 +#define r25 $25 +#define r26 $26 +#define r27 $27 +#define r28 $28 +#define r29 $29 +#define r30 $30 +#define r31 $31 +#define r32 $32 +#define r33 $33 +#define r34 $34 +#define r35 $35 +#define r36 $36 +#define r37 $37 +#define r38 $38 +#define r39 $39 +#define r40 $40 +#define r41 $41 +#define r42 $42 +#define r43 $43 +#define r44 $44 +#define r45 $45 +#define r46 $46 +#define r47 $47 +#define r48 $48 +#define r49 $49 +#define r50 $50 +#define r51 $51 +#define r52 $52 +#define r53 $53 +#define r54 $54 +#define r55 $55 +#define r56 $56 +#define r57 $57 +#define r58 $58 +#define r59 $59 +#define r60 $60 +#define r61 $61 +#define r62 $62 +#define r63 $63 +#define r64 $64 +#define r65 $65 +#define r66 $66 +#define r67 $67 +#define r68 $68 +#define r69 $69 +#define r70 $70 +#define r71 $71 +#define r72 $72 +#define r73 $73 +#define r74 $74 +#define r75 $75 +#define r76 $76 +#define r77 $77 +#define r78 $78 +#define r79 $79 +#define r80 $80 +#define r81 $81 +#define r82 $82 +#define r83 $83 +#define r84 $84 +#define r85 $85 +#define r86 $86 +#define r87 $87 +#define r88 $88 +#define r89 $89 +#define r90 $90 +#define r91 $91 +#define r92 $92 +#define r93 $93 +#define r94 $94 +#define r95 $95 +#define r96 $96 +#define r97 $97 +#define r98 $98 +#define r99 $99 +#define r100 $100 +#define r101 $101 +#define r102 $102 +#define r103 $103 +#define r104 $104 +#define r105 $105 +#define r106 $106 +#define r107 $107 +#define r108 $108 +#define r109 $109 +#define r110 $110 +#define r111 $111 +#define r112 $112 +#define r113 $113 +#define r114 $114 +#define r115 $115 +#define r116 $116 +#define r117 $117 +#define r118 $118 +#define r119 $119 +#define r120 $120 +#define r121 $121 +#define r122 $122 +#define r123 $123 +#define r124 $124 +#define r125 $125 +#define r126 $126 +#define r127 $127 + + +#define lr r0 // link register +#define sp r1 // stack pointer + // r2 is environment pointer for langs that need it (ALGOL) + +#define retval r3 // return values are passed in regs starting at r3 + +#define arg1 r3 // args are passed in regs starting at r3 +#define arg2 r4 +#define arg3 r5 +#define arg4 r6 +#define arg5 r7 +#define arg6 r8 +#define arg7 r9 +#define arg8 r10 +#define arg9 r11 +#define arg10 r12 + +// r3 - r74 are volatile (caller saves) +// r74 - r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog) +// r80 - r127 are non-volatile (caller-saves) + +// scratch registers reserved for use by the macros in this file. + +#define _gc_t0 r79 +#define _gc_t1 r78 +#define _gc_t2 r77 + +/* + * ---------------------------------------------------------------- + * pseudo ops + * ---------------------------------------------------------------- + */ +#define PROC_ENTRY(name) \ + .text; \ + .p2align 4; \ + .global name; \ + .type name, @function; \ +name: + +/* + * ---------------------------------------------------------------- + * aliases for common operations + * ---------------------------------------------------------------- + */ + +// Move register (even pipe, 2 cycles) +#define MR(rt, ra) or rt, ra, ra; + +// Move register (odd pipe, 4 cycles) +#define LMR(rt, ra) rotqbyi rt, ra, 0; + +// return +#define RETURN() bi lr; + +// hint for a return +#define HINT_RETURN(ret_label) hbr ret_label, lr; + +// return if zero +#define BRZ_RETURN(rt) biz rt, lr; + +// return if not zero +#define BRNZ_RETURN(rt) binz rt, lr; + +// return if halfword zero +#define BRHZ_RETURN(rt) bihz rt, lr; + +// return if halfword not zero +#define BRHNZ_RETURN(rt) bihnz rt, lr; + + +/* + * ---------------------------------------------------------------- + * modulo like things for constant moduli that are powers of 2 + * ---------------------------------------------------------------- + */ + +// rt = ra & (pow2 - 1) +#define MODULO(rt, ra, pow2) \ + andi rt, ra, (pow2)-1; + +// rt = pow2 - (ra & (pow2 - 1)) +#define MODULO_NEG(rt, ra, pow2) \ + andi rt, ra, (pow2)-1; \ + sfi rt, rt, (pow2); + +// rt = ra & -(pow2) +#define ROUND_DOWN(rt, ra, pow2) \ + andi rt, ra, -(pow2); + +// rt = (ra + (pow2 - 1)) & -(pow2) +#define ROUND_UP(rt, ra, pow2) \ + ai rt, ra, (pow2)-1; \ + andi rt, rt, -(pow2); + +/* + * ---------------------------------------------------------------- + * Splat - replicate a particular slot into all slots + * Altivec analogs... + * ---------------------------------------------------------------- + */ + +// replicate byte from slot s [0,15] +#define VSPLTB(rt, ra, s) \ + ilh _gc_t0, (s)*0x0101; \ + shufb rt, ra, ra, _gc_t0; + +// replicate halfword from slot s [0,7] +#define VSPLTH(rt, ra, s) \ + ilh _gc_t0, 2*(s)*0x0101 + 0x0001; \ + shufb rt, ra, ra, _gc_t0; + +// replicate word from slot s [0,3] +#define VSPLTW(rt, ra, s) \ + iluh _gc_t0, 4*(s)*0x0101 + 0x0001; \ + iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \ + shufb rt, ra, ra, _gc_t0; + +// replicate double from slot s [0,1] +#define VSPLTD(rt, ra, s) \ + /* sp is always 16-byte aligned */ \ + cdd _gc_t0, 8(sp); /* 0x10111213 14151617 00010203 04050607 */ \ + rotqbyi rt, ra, ra, (s) << 3; /* rotate double into preferred slot */ \ + shufb rt, rt, rt, _gc_t0; + +/* + * ---------------------------------------------------------------- + * lots of min/max variations... + * + * On a slot by slot basis, compute the min or max + * + * U - unsigned, else signed + * B,H,{} - byte, halfword, word + * F float + * ---------------------------------------------------------------- + */ + +#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc; +#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc; + + // words + +#define MIN(rt, ra, rb) \ + cgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAX(rt, ra, rb) \ + cgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMIN(rt, ra, rb) \ + clgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAX(rt, ra, rb) \ + clgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // bytes + +#define MINB(rt, ra, rb) \ + cgtb _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAXB(rt, ra, rb) \ + cgtb _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMINB(rt, ra, rb) \ + clgtb _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAXB(rt, ra, rb) \ + clgtb _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // halfwords + +#define MINH(rt, ra, rb) \ + cgth _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAXH(rt, ra, rb) \ + cgth _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMINH(rt, ra, rb) \ + clgth _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAXH(rt, ra, rb) \ + clgth _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // floats + +#define FMIN(rt, ra, rb) \ + fcgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define FMAX(rt, ra, rb) \ + fcgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +// Ignoring the sign, select the values with the minimum magnitude +#define FMINMAG(rt, ra, rb) \ + fcmgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +// Ignoring the sign, select the values with the maximum magnitude +#define FMAXMAG(rt, ra, rb) \ + fcmgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + +#endif /* INCLUDED_GC_SPU_MACS_H */ diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c new file mode 100644 index 000000000..765cacd9a --- /dev/null +++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c @@ -0,0 +1,160 @@ +#include + +void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){ + //loop iterator i + int i = 0; + void* retval = target; + + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at (unsigned int)src%16 + vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16); + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); + + //alpha: first half of first, second half of second, break at (unsigned int)target%16 + src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + + + + + qword src_past; + qword src_present; + qword tgt_past; + qword tgt_present; + + qword in_temp; + qword out_temp0; + qword out_temp1; + + src_past = si_lqd((qword)address_counter_src, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + vector signed short vec_val = spu_splats(val); + vector unsigned short compare; + vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1}; + vector unsigned short after_and; + + for(i = 0; i < num_bytes/16; ++i) { + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); + + compare = spu_cmpgt((vector signed short) in_temp, vec_val); + after_and = spu_and(compare, ones); + + + out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src_past = src_present; + address_counter_src = spu_add(address_counter_src, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); + + compare = spu_cmpgt((vector signed short) in_temp, vec_val); + after_and = spu_and(compare, ones); + + + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; +} + + + +/* +int main(){ + + signed short pooh[48]; + signed short bear[48]; + + int i = 0; + for(i = 0; i < 48; i += 2){ + bear[i] = i; + bear[i + 1] = -i; + } + + vector_gt_16bit(&pooh[0],&bear[0], 0, 48 * sizeof(signed short)); + + for(i = 0; i < 48; ++i) { + printf("%d, ", pooh[i]); + } + printf("\n"); +} +*/ + diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c new file mode 100644 index 000000000..a3ce6c2fe --- /dev/null +++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c @@ -0,0 +1,178 @@ +#include + +void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){ + //loop iterator i + int i = 0; + void* retval = target; + + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; + vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 + vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); + + //eta: second half of the second, first half of the first, break at (unsigned int)src1%16 + src_cmp = spu_splats((unsigned char)((unsigned int)src1%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + sixteen_uchar = spu_splats((unsigned char)16); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); + + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); + + //alpha: first half of first, second half of second, break at (unsigned int)target%16 + src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + + + + + qword src0_past; + qword src0_present; + qword src1_past; + qword src1_present; + qword tgt_past; + qword tgt_present; + + qword in_temp0; + qword in_temp1; + qword out_temp0; + qword out_temp1; + + vector signed short sum; + + src0_past = si_lqd((qword)address_counter_src0, 0); + src1_past = si_lqd((qword)address_counter_src1, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + for(i = 0; i < num_bytes/16; ++i) { + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); + + sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1); + + + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src0_past = src0_present; + src1_past = src1_present; + address_counter_src0 = spu_add(address_counter_src0, 16); + address_counter_src1 = spu_add(address_counter_src1, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); + sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1); + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; +} + + + +/* +int main(){ + + signed short pooh[48]; + signed short bear[48]; + signed short res[48]; + + int i = 0; + for(i = 0; i < 48; ++i){ + pooh[i] = i; + } + for(i = 48; i < 96; ++i){ + bear[i - 48] = i; + } + + vector_subtract_16bit(res, &pooh[0], &bear[0], 48 * sizeof(signed short)); + + for(i = 0; i < 48; ++i) { + printf("%d, ", res[i]); + } + printf("\n"); +} +*/ + diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c new file mode 100644 index 000000000..5a1cb9aaf --- /dev/null +++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c @@ -0,0 +1,178 @@ +#include + +void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){ + //loop iterator i + int i = 0; + void* retval = target; + + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; + vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 + vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); + + //eta: second half of the second, first half of the first, break at (unsigned int)src1%16 + src_cmp = spu_splats((unsigned char)((unsigned int)src1%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + sixteen_uchar = spu_splats((unsigned char)16); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); + + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); + + //alpha: first half of first, second half of second, break at (unsigned int)target%16 + src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + + + + + qword src0_past; + qword src0_present; + qword src1_past; + qword src1_present; + qword tgt_past; + qword tgt_present; + + qword in_temp0; + qword in_temp1; + qword out_temp0; + qword out_temp1; + + vector signed int sum; + + src0_past = si_lqd((qword)address_counter_src0, 0); + src1_past = si_lqd((qword)address_counter_src1, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + for(i = 0; i < num_bytes/16; ++i) { + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); + + sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1); + + + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src0_past = src0_present; + src1_past = src1_present; + address_counter_src0 = spu_add(address_counter_src0, 16); + address_counter_src1 = spu_add(address_counter_src1, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); + sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1); + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; +} + + + +/* +int main(){ + + signed short pooh[48]; + signed short bear[48]; + signed short res[48]; + + int i = 0; + for(i = 0; i < 48; ++i){ + pooh[i] = i; + } + for(i = 48; i < 96; ++i){ + bear[i - 48] = i; + } + + vector_sum(&pooh[9], &pooh[9], &bear[3], 30); + + for(i = 0; i < 48; ++i) { + printf("%d, ", pooh[i]); + } + printf("\n"); +} +*/ + diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c new file mode 100644 index 000000000..58fd4aa0c --- /dev/null +++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c @@ -0,0 +1,222 @@ +#include + + + + +void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){ + //loop iterator i + int i = 0; + void* retval = target; + + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0}; + vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16 + vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16); + + //eta: second half of the second, first half of the first, break at (unsigned int)src1%16 + src_cmp = spu_splats((unsigned char)((unsigned int)src1%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + sixteen_uchar = spu_splats((unsigned char)16); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16); + + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); + + //alpha: first half of first, second half of second, break at (unsigned int)target%16 + src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + + + + + qword src0_past; + qword src0_present; + qword src1_past; + qword src1_present; + qword tgt_past; + qword tgt_present; + + qword in_temp0; + qword in_temp1; + qword out_temp0; + qword out_temp1; + + + src0_past = si_lqd((qword)address_counter_src0, 0); + src1_past = si_lqd((qword)address_counter_src1, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b}; + vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b}; + vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f}; + vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00}; + + vector float prod0; + qword shuf0; + vector float prod1; + vector float sign_change; + qword summand0; + qword summand1; + vector float sum; + + + for(i = 0; i < num_bytes/16; ++i) { + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta); + + prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); + shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); + prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0); + sign_change = spu_xor(prod0, (vector float)sign_changer); + + summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); + + summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); + + sum = spu_add((vector float)summand0, (vector float)summand1); + + + out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src0_past = src0_present; + src1_past = src1_present; + address_counter_src0 = spu_add(address_counter_src0, 16); + address_counter_src1 = spu_add(address_counter_src1, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src0_present = si_lqd((qword)address_counter_src0, 16); + src1_present = si_lqd((qword)address_counter_src1, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma); + in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta); + + + prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1); + shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0); + prod1 = spu_mul(prod0, (vector float)shuf0); + sign_change = spu_xor(prod0, (vector float)sign_changer); + summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1); + summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2); + sum = spu_add((vector float)summand0, (vector float)summand1); + + + + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; +} + + + +/* +int main(){ + + float pooh[48]; + float bear[48]; + float res[48]; + + int i = 0; + for(i = 0; i < 48; ++i){ + pooh[i] = (float) i; + } + for(i = 48; i < 96; ++i){ + bear[i - 48] = (float) i; + } + + vector_product_complex(res, pooh, bear, 48*sizeof(float)); + + + + for(i = 0; i < 48; ++i) { + printf("%f, ", res[i]); + } + printf("\n"); + + +} +*/ + diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c new file mode 100644 index 000000000..2a0dabcd7 --- /dev/null +++ b/volk/spu_lib/spu_memcpy_unaligned.c @@ -0,0 +1,290 @@ +#include + +void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){ + //loop iterator i + int i = 0; + void* retval = target; + + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at (unsigned int)src%16 + vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16); + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16)); + + //alpha: first half of first, second half of second, break at (unsigned int)target%16 + src_cmp = spu_splats((unsigned char)((unsigned int)target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at (unsigned int)target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + + + + + qword src_past; + qword src_present; + qword tgt_past; + qword tgt_present; + + qword in_temp; + qword out_temp0; + qword out_temp1; + + src_past = si_lqd((qword)address_counter_src, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + for(i = 0; i < num_bytes/16; ++i) { + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); + + out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src_past = src_present; + address_counter_src = spu_add(address_counter_src, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; +} + + + +/* +void* mcpy(void* target, void* src, size_t num_bytes){ + //loop iterator i + int i = 0; + void* retval = src; + + //put the target and source addresses into qwords + vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0}; + vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0}; + + //create shuffle masks + + //shuffle mask building blocks: + //all from the first vector + vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + //all from the second vector + vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + + + + //gamma: second half of the second, first half of the first, break at src%16 + vector unsigned char src_cmp = spu_splats((unsigned char)(src%16)); + vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp); + vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp); + vector unsigned char cmp_res = spu_or(gt_res, eq_res); + vector unsigned char sixteen_uchar = spu_splats((unsigned char)16); + vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16); + + + + + vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16)); + vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16)); + + //alpha: first half of first, second half of second, break at target%16 + src_cmp = spu_splats((unsigned char)(target%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + //delta: first half of first, first half of second, break at target%16 + vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha); + //epsilon: second half of second, second half of first, break at target%16 + vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha); + //zeta: second half of second, first half of first, break at 16 - target%16 + vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16); + + //beta: first half of first, second half of second, break at num_bytes%16 + src_cmp = spu_splats((unsigned char)(num_bytes%16)); + gt_res = spu_cmpgt(oneup, src_cmp); + eq_res = spu_cmpeq(oneup, src_cmp); + cmp_res = spu_or(gt_res, eq_res); + phase_change = spu_and(sixteen_uchar, cmp_res); + vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change, + (vector unsigned int)oneup); + + + printf("num_bytesmod16 %d\n", num_bytes%16); + printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", + spu_extract((vector unsigned char) shuffle_mask_beta, 0), + spu_extract((vector unsigned char) shuffle_mask_beta, 1), + spu_extract((vector unsigned char) shuffle_mask_beta, 2), + spu_extract((vector unsigned char) shuffle_mask_beta, 3), + spu_extract((vector unsigned char) shuffle_mask_beta, 4), + spu_extract((vector unsigned char) shuffle_mask_beta, 5), + spu_extract((vector unsigned char) shuffle_mask_beta, 6), + spu_extract((vector unsigned char) shuffle_mask_beta, 7), + spu_extract((vector unsigned char) shuffle_mask_beta, 8), + spu_extract((vector unsigned char) shuffle_mask_beta, 9), + spu_extract((vector unsigned char) shuffle_mask_beta, 10), + spu_extract((vector unsigned char) shuffle_mask_beta, 11), + spu_extract((vector unsigned char) shuffle_mask_beta, 12), + spu_extract((vector unsigned char) shuffle_mask_beta, 13), + spu_extract((vector unsigned char) shuffle_mask_beta, 14), + spu_extract((vector unsigned char) shuffle_mask_beta, 15)); + + + + + + + + qword src_past; + qword src_present; + qword tgt_past; + qword tgt_present; + + qword in_temp; + qword out_temp0; + qword out_temp1; + + src_past = si_lqd((qword)address_counter_src, 0); + tgt_past = si_lqd((qword)address_counter_tgt, 0); + + for(i = 0; i < num_bytes/16; ++i) { + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma); + + out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + tgt_past = out_temp1; + src_past = src_present; + address_counter_src = spu_add(address_counter_src, 16); + address_counter_tgt = spu_add(address_counter_tgt, 16); + + + } + + src_present = si_lqd((qword)address_counter_src, 16); + tgt_present = si_lqd((qword)address_counter_tgt, 16); + + + in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma); + qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta); + qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta); + + + + out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta); + out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon); + + si_stqd(out_temp0, (qword)address_counter_tgt, 0); + si_stqd(out_temp1, (qword)address_counter_tgt, 16); + + return retval; + +} +*/ +/* +int main(){ + + unsigned char pooh[48]; + unsigned char bear[48]; + + int i = 0; + for(i = 0; i < 48; ++i){ + pooh[i] = i; + bear[i] = i; + } + + spu_mcpy(&pooh[9],&bear[3], 15); + + for(i = 0; i < 48; ++i) { + printf("%d, ", pooh[i]); + } + printf("\n"); +} + +*/ diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S new file mode 100644 index 000000000..a655c4c52 --- /dev/null +++ b/volk/spu_lib/spu_memset_unaligned.S @@ -0,0 +1,185 @@ +/* -*- asm -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "gc_spu_macs.h" + + .file "spu_memset_unaligned.S" + + /* + * Computes this, only a lot faster... + * + * void * + * libvector_memset_unaligned(void *pv, int c, size_t n) + * { + * unsigned char *p = (unsigned char *) pv; + * size_t i; + * for (i = 0; i < n; i++) + * p[i] = c; + * + * return pv; + * } + */ + +#define p_arg arg1 // we're going to clobber arg1 w/ the return value +#define c arg2 // the constant we're writing +#define n arg3 // how many bytes to write + +#define p r13 // where we're writing +#define t0 r14 +#define t1 r15 +#define mask r16 +#define old r17 +#define an r18 // aligned n (n rounded down to mod 16 boundary) +#define next_p r19 +#define cond1 r20 +#define cond2 r21 +#define m r22 +#define r r23 + + PROC_ENTRY(libvector_memset_unaligned) + + // Hint the return from do_head, in case we go that way. + // There's pretty much nothing to can do to hint the branch to it. + hbrr do_head_br, head_complete + + MR(p, p_arg) // leaves p, the return value, in the correct reg (r3) + BRZ_RETURN(n) + + MODULO(t0, p, 16) // is p%16 == 0? + VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots + brnz t0, do_head // no, handle it +head_complete: + + /* + * preconditions: + * p%16 == 0, n > 0 + */ + hbrr middle_loop_br, middle_loop + + ROUND_DOWN(an, n, 16) // an is "aligned n" + MODULO(n, n, 16) // what's left over in the last quad + brz an, do_tail // no whole quad words; skip to tail + clgti t0, an, 127 // an >= 128? + brz t0, middle2 // nope, go handle the cases between 0 and 112 + + /* + * 128 bytes / iteration + */ + .p2align 4 +middle_loop: + ai an, an, -128 + stqd c, 0*16(p) + ai next_p, p, 128 + stqd c, 1*16(p) + cgti cond1, an, 127 + stqd c, 2*16(p) + + stqd c, 3*16(p) + stqd c, 4*16(p) + stqd c, 5*16(p) + stqd c, 6*16(p) + + MR(p, next_p) + stqd c, 7*16-128(next_p) + or cond2, n, an +middle_loop_br: + brnz cond1, middle_loop + + /* + * if an and n are both zero, return now + */ + BRZ_RETURN(cond2) + + /* + * otherwise handle last of full quad words + * + * 0 <= an < 128, p%16 == 0 + */ +middle2: + /* + * if an == 0, go handle the final non-full quadword + */ + brz an, do_tail + hbrr middle2_loop_br, middle2_loop + + .p2align 3 +middle2_loop: + ai next_p, p, 16 + stqd c, 0(p) + ai an, an, -16 + LMR(p, next_p) +middle2_loop_br: + brnz an, middle2_loop + + /* We're done with the full quadwords. */ + + /* + * Handle the final partial quadword. + * We'll be modifying only the left hand portion of the quad. + * + * preconditions: + * an == 0, 0 <= n < 16, p%16 == 0 + */ +do_tail: + HINT_RETURN(do_tail_ret) + il mask, -1 + sfi t1, n, 16 // t1 = 16 - n + lqd old, 0(p) + shlqby mask, mask, t1 + selb t0, old, c, mask + stqd t0, 0(p) +do_tail_ret: + RETURN() + + /* + * ---------------------------------------------------------------- + * Handle the first partial quadword + * + * preconditions: + * p%16 != 0 + * + * postconditions: + * p%16 == 0 or n == 0 + * + * |-- m --| + * +----------------+----------------+ + * | //////// | | + * +----------------+----------------+ + * |----- r -----| + * p + * ---------------------------------------------------------------- + */ +do_head: + lqd old, 0(p) + MODULO_NEG(r, p, 16) + il mask, -1 + UMIN(m, r, n) + shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom + MR(t1, p) + sf t0, m, r // t0 = r - m + a p, p, m // p += m + rotqby mask, mask, t0 // rotate 0's to the right place + sf n, m, n // n -= m + selb t0, c, old, mask // merge + stqd t0, 0(t1) + BRZ_RETURN(n) +do_head_br: + br head_complete diff --git a/volk/system_cleanup.sh b/volk/system_cleanup.sh new file mode 100755 index 000000000..7ebd2caf8 --- /dev/null +++ b/volk/system_cleanup.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +cp volk_config.h include/volk/ +cd lib +case $1 in + (x86) + + case $2 in + (x86_64) + gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86_64.S volk_rank_archs.c volk_mktables.c + ./volk_mktables + ;; + (*) + gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86.S volk_rank_archs.c volk_mktables.c + ./volk_mktables + ;; + esac + ;; + (powerpc) + gcc -o volk_mktables -I../include -I. volk_cpu_powerpc.c volk_rank_archs.c volk_mktables.c + ./volk_mktables + ;; + (*) + gcc -o volk_mktables -I../include -I. volk_cpu_generic.c volk_rank_archs.c volk_mktables.c + ./volk_mktables + ;; +esac +mv volk_tables.h ../include/volk/ + diff --git a/volk/volk.pc.in b/volk/volk.pc.in new file mode 100644 index 000000000..a24298856 --- /dev/null +++ b/volk/volk.pc.in @@ -0,0 +1,15 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ +LV_CXXFLAGS=@LV_CXXFLAGS@ + + + +Name: volk +Description: VOLK.. Vector Optimized Library of Kernels +Requires: +Version: @VERSION@ +Libs: -lvolk -lvolk_runtime +Cflags: -I${includedir} ${LV_CXXFLAGS} + -- cgit From 1186ab980d1e1fe36ff869097fd0697dd41ff7ae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 00:59:23 -0500 Subject: volk: Updating build system so that volk configures and builds from gnuradio top level dir. --- volk/Makefile.common | 4 +-- volk/apps/Makefile.am | 2 +- volk/config/lv_configure.m4 | 30 ++++++++++++-------- volk/configure.ac | 27 ++++++++++++++---- volk/include/volk/Makefile.am | 6 ++-- volk/python/Makefile.am | 65 ++++--------------------------------------- 6 files changed, 52 insertions(+), 82 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.common b/volk/Makefile.common index 3b028b147..083f6f710 100644 --- a/volk/Makefile.common +++ b/volk/Makefile.common @@ -1,6 +1,6 @@ # -*- Makefile -*- # -# Copyright 2004,2006,2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -27,7 +27,7 @@ ourswigincludedir = $(ourincludedir)/swig # Install this stuff in the appropriate subdirectory # This usually ends up at: -# ${prefix}/lib/python${python_version}/site-packages/libvector +# ${prefix}/lib/python${python_version}/site-packages/libvolk ourpythondir = $(pythondir)/volk ourpyexecdir = $(pyexecdir)/volk diff --git a/volk/apps/Makefile.am b/volk/apps/Makefile.am index a51823971..5e9bc254f 100644 --- a/volk/apps/Makefile.am +++ b/volk/apps/Makefile.am @@ -22,6 +22,6 @@ include $(top_srcdir)/Makefile.common # C++ stuff here -if USE_PYTHON +if PYTHON # python stuff here endif diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 index fc6a0a567..efcec5d3e 100644 --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -1,5 +1,20 @@ -dnl all this stuff taken and modified from GNURADIO! dnl +dnl Copyright 2010 Free Software Foundation, Inc. +dnl +dnl This program is free software: you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation, either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program. If not, see . +dnl + dnl LV_CONFIGURE dnl dnl Handles the bulk of the configure.ac work for an out-of-tree build @@ -18,16 +33,9 @@ dnl get called too late to be useful. m4_define([LV_CONFIGURE], [ - - AC_CONFIG_SRCDIR([config/lv_configure.m4]) - AC_CONFIG_AUX_DIR([.]) - AM_CONFIG_HEADER(config.h) - - AC_CANONICAL_BUILD - AC_CANONICAL_HOST - AC_CANONICAL_TARGET - - AM_INIT_AUTOMAKE(libvector,0.0svn) + #AC_CANONICAL_BUILD + #AC_CANONICAL_HOST + #AC_CANONICAL_TARGET LF_CONFIGURE_CC LF_CONFIGURE_CXX diff --git a/volk/configure.ac b/volk/configure.ac index eb9fbdc55..4c095cbd5 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -1,7 +1,26 @@ -AC_INIT(libvector,0.0svn) +dnl +dnl Copyright 2010 Free Software Foundation, Inc. +dnl +dnl This program is free software: you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation, either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program. If not, see . +dnl + +AC_INIT AC_PREREQ(2.57) AC_CONFIG_AUX_DIR([.]) - +AC_CONFIG_SRCDIR([lib/test_all.cc]) +AM_CONFIG_HEADER(config.h) +AM_INIT_AUTOMAKE(volk,0.0svn) dnl This is kind of non-standard, but it sure shortens up this file :-) @@ -53,10 +72,6 @@ AC_CONFIG_FILES([\ volk.pc \ ]) - - - - AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x system_cleanup.sh && ./system_cleanup.sh $MYCPU $MYSUBCPU], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) AC_OUTPUT diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index fd59ab795..94c78ffff 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -20,7 +20,9 @@ include $(top_srcdir)/Makefile.common -ourinclude_HEADERS = \ +volkincludedir = $(prefix)/include/volk + +volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ volk_config_fixed.h \ diff --git a/volk/python/Makefile.am b/volk/python/Makefile.am index 8485c4793..8519a102a 100644 --- a/volk/python/Makefile.am +++ b/volk/python/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -33,63 +33,8 @@ dist-hook: @for file in $(swig_built_sources); do $(RM) $(distdir)/$$file; done -# if we're not using python, ignore the rest... +# Set the install location of any python scripts to /volk +volkpythondir = $(pythondir)/volk -TESTS = - - -ALL_IFILES = \ - $(LOCAL_IFILES) \ - $(NON_LOCAL_IFILES) - -NON_LOCAL_IFILES = \ - $(GNURADIO_CORE_INCLUDEDIR)/swig/gnuradio.i - -LOCAL_IFILES = \ - libvector.i \ - libvector_square_ff.i - - -# These files are built by SWIG. The first is the C++ glue. -# The second is the python wrapper that loads the _libvector shared library -# and knows how to call our extensions. - -swig_built_sources = \ - libvector_swig.cc \ - libvector_swig.py - -# This gets libvector.py installed in the right place. -# Any hand-written python should be listed here too. -ourpython_PYTHON = \ - __init__.py \ - libvector_swig.py - -# Python QA code gets listed here. It's not installed. -noinst_PYTHON = - -# This library contains the swig generated glue -ourpyexec_LTLIBRARIES = _libvector_swig.la - -# These are the source files that go into the shared library -_libvector_swig_la_SOURCES = \ - libvector_swig.cc - -# Dummy C++ source to cause C++ linking -nodist_EXTRA__libvector_swig_la_SOURCES = dummy.cc - -# magic flags -_libvector_swig_la_LDFLAGS = $(NO_UNDEFINED) -module -avoid-version - -# Link the swig generated code against our library and some python magic. -_libvector_swig_la_LIBADD = \ - ../lib/liblibvector.la \ - $(PYTHON_LDFLAGS) - - -libvector_swig.cc libvector_swig.py: $(LOCAL_IFILES) $(ALL_IFILES) - $(SWIG) $(SWIGPYTHONARGS) -module libvector_swig -o libvector_swig.cc $(srcdir)/libvector.i - - -# These swig headers get installed in ${prefix}/include//swig -ourswiginclude_HEADERS = \ - $(LOCAL_IFILES) +volkpython_PYTHON = \ + __init__.py -- cgit From 917e67d031bf5f044dfbbae3d4cf01b167b7b903 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:00:42 -0500 Subject: volk: Cleaning up naming from old libvector to volk. --- volk/python/__init__.py | 10 ++++----- volk/python/libvector.i | 47 --------------------------------------- volk/python/libvector_square_ff.i | 37 ------------------------------ volk/python/qa_square.py | 8 +++---- volk/python/volk.i | 47 +++++++++++++++++++++++++++++++++++++++ volk/python/volk_square_ff.i | 37 ++++++++++++++++++++++++++++++ 6 files changed, 93 insertions(+), 93 deletions(-) delete mode 100644 volk/python/libvector.i delete mode 100644 volk/python/libvector_square_ff.i create mode 100644 volk/python/volk.i create mode 100644 volk/python/volk_square_ff.i (limited to 'volk') diff --git a/volk/python/__init__.py b/volk/python/__init__.py index 6513a31dd..7239e7e23 100644 --- a/volk/python/__init__.py +++ b/volk/python/__init__.py @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -38,12 +38,12 @@ if _RTLD_GLOBAL != 0: # ---------------------------------------------------------------- -# import swig generated symbols into the libvector namespace -from libvector_swig import * +# import swig generated symbols into the volk namespace +from volk_swig import * # import any pure python here -# from libvector_foo import bar -# from libvector_baz import * +# from volk_foo import bar +# from volk_baz import * # ---------------------------------------------------------------- diff --git a/volk/python/libvector.i b/volk/python/libvector.i deleted file mode 100644 index 598c20a2a..000000000 --- a/volk/python/libvector.i +++ /dev/null @@ -1,47 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -%feature("autodoc", "1"); // generate python docstrings - -%include "exception.i" -%import "gnuradio.i" // the common stuff - -%{ -#include "gnuradio_swig_bug_workaround.h" // mandatory bug fix -#include -%} - -// ---------------------------------------------------------------- - -/* - * Gather all .i files in this directory together. - */ - -%{ - -// The .h files -#include - -%} - -// The .i files -%include - diff --git a/volk/python/libvector_square_ff.i b/volk/python/libvector_square_ff.i deleted file mode 100644 index a1c547cc7..000000000 --- a/volk/python/libvector_square_ff.i +++ /dev/null @@ -1,37 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -/* - * First arg is the package prefix. - * Second arg is the name of the class minus the prefix. - * - * This does some behind-the-scenes magic so we can - * access gr_example_square_ff from python as howto.square_ff - */ -GR_SWIG_BLOCK_MAGIC(libvector,square_ff); - -libvector_square_ff_sptr libvector_make_square_ff (); - -class libvector_square_ff : public gr_sync_block -{ -private: - libvector_square_ff(); -}; diff --git a/volk/python/qa_square.py b/volk/python/qa_square.py index 1ee56411f..53f0433ca 100755 --- a/volk/python/qa_square.py +++ b/volk/python/qa_square.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright 2004,2007,2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -21,9 +21,9 @@ # from gnuradio import gr, gr_unittest -import libvector_swig as libvector +import volk_swig as volk -class qa_libvector(gr_unittest.TestCase): +class qa_volk(gr_unittest.TestCase): def setUp(self): self.tb = gr.top_block() @@ -35,7 +35,7 @@ class qa_libvector(gr_unittest.TestCase): src_data = (-3, 4, -5.5, 2, 3) expected_result = (9, 16, 30.25, 4, 9) src = gr.vector_source_f(src_data) - sqr = libvector.square_ff() + sqr = volk.square_ff() dst = gr.vector_sink_f() self.tb.connect(src, sqr) self.tb.connect(sqr, dst) diff --git a/volk/python/volk.i b/volk/python/volk.i new file mode 100644 index 000000000..ea3a037ba --- /dev/null +++ b/volk/python/volk.i @@ -0,0 +1,47 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +%feature("autodoc", "1"); // generate python docstrings + +%include "exception.i" +%import "gnuradio.i" // the common stuff + +%{ +#include "gnuradio_swig_bug_workaround.h" // mandatory bug fix +#include +%} + +// ---------------------------------------------------------------- + +/* + * Gather all .i files in this directory together. + */ + +%{ + +// The .h files +#include + +%} + +// The .i files +%include + diff --git a/volk/python/volk_square_ff.i b/volk/python/volk_square_ff.i new file mode 100644 index 000000000..1d485bd7e --- /dev/null +++ b/volk/python/volk_square_ff.i @@ -0,0 +1,37 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * First arg is the package prefix. + * Second arg is the name of the class minus the prefix. + * + * This does some behind-the-scenes magic so we can + * access gr_example_square_ff from python as howto.square_ff + */ +GR_SWIG_BLOCK_MAGIC(volk,square_ff); + +volk_square_ff_sptr volk_make_square_ff (); + +class volk_square_ff : public gr_sync_block +{ +private: + volk_square_ff(); +}; -- cgit From f8b0c86d8a9eb347cb7187e3b01ed46c66de6a64 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:09:35 -0500 Subject: volk: Adding gitignore files. --- volk/.gitignore | 43 +++++++++++++++++++++++++++++++++++++++++++ volk/config/.gitignore | 16 ++++++++++++++++ volk/include/.gitignore | 10 ++++++++++ volk/include/volk/.gitignore | 19 +++++++++++++++++++ volk/lib/.gitignore | 21 +++++++++++++++++++++ 5 files changed, 109 insertions(+) create mode 100644 volk/.gitignore create mode 100644 volk/config/.gitignore create mode 100644 volk/include/.gitignore create mode 100644 volk/include/volk/.gitignore create mode 100644 volk/lib/.gitignore (limited to 'volk') diff --git a/volk/.gitignore b/volk/.gitignore new file mode 100644 index 000000000..dcd421895 --- /dev/null +++ b/volk/.gitignore @@ -0,0 +1,43 @@ +.* +*.o +*.a +*.ko +*.so +*.la +*.lo +*.py[oc] +*.gz +*.exe +*.patch +*~ +\#*# +.deps +.libs +TAGS +*-stamp +!.gitattributes +!.gitignore +make.log +/configure +/Makefile.in +/config.log +/config.h +/ltmain.sh +/Makefile +/config.status +/stamp-h1 +/stamp-h2 +/config.h.in +/autom4te.cache +/libtool +/missing +/aclocal.m4 +/install-sh +/depcomp +/py-compile +/compile +/build +/run_tests.sh +/volk.pc +/volk_config.h +/volk_config.h.in diff --git a/volk/config/.gitignore b/volk/config/.gitignore new file mode 100644 index 000000000..6330cd1b6 --- /dev/null +++ b/volk/config/.gitignore @@ -0,0 +1,16 @@ +*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/libtool.m4 +/ltoptions.m4 +/ltsugar.m4 +/ltversion.m4 +/lt~obsolete.m4 +/lv_set_simd_flags.m4 diff --git a/volk/include/.gitignore b/volk/include/.gitignore new file mode 100644 index 000000000..378f771f5 --- /dev/null +++ b/volk/include/.gitignore @@ -0,0 +1,10 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore new file mode 100644 index 000000000..ae4824cc3 --- /dev/null +++ b/volk/include/volk/.gitignore @@ -0,0 +1,19 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/volk.h +/volk_config.h +/volk_config_fixed.h +/volk_cpu.h +/volk_environment_init.h +/volk_registry.h +/volk_runtime.h +/volk_tables.h +/volk_typedefs.h diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore new file mode 100644 index 000000000..573fb1618 --- /dev/null +++ b/volk/lib/.gitignore @@ -0,0 +1,21 @@ +/*.cache +/*.la +/*.lo +/*.pc +/.deps +/.la +/.libs +/.lo +/Makefile +/Makefile.in +/volk.c +/volk_cpu_generic.c +/volk_cpu_powerpc.c +/volk_cpu_x86.c +/volk_environment_init.c +/volk_init.c +/volk_init.h +/volk_mktables +/volk_mktables.c +/volk_proccpu_sim.c +/volk_runtime.c -- cgit From 81d722595385eb4b2ef09cb76db1d4070fc11324 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:10:10 -0500 Subject: volk: adding config.guess and config.sub to project distribution. --- volk/config.guess | 1 + volk/config.sub | 1 + 2 files changed, 2 insertions(+) create mode 120000 volk/config.guess create mode 120000 volk/config.sub (limited to 'volk') diff --git a/volk/config.guess b/volk/config.guess new file mode 120000 index 000000000..405bc3235 --- /dev/null +++ b/volk/config.guess @@ -0,0 +1 @@ +/usr/share/automake-1.11/config.guess \ No newline at end of file diff --git a/volk/config.sub b/volk/config.sub new file mode 120000 index 000000000..4d47fbcbc --- /dev/null +++ b/volk/config.sub @@ -0,0 +1 @@ +/usr/share/automake-1.11/config.sub \ No newline at end of file -- cgit From 74f206edb2c7bfbe010b5a5cbc5fe2f07965c3a6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 01:29:58 -0500 Subject: volk: Fixing makefiles for dist. Distcheck still failing on other issues now. --- volk/config/Makefile.am | 4 ---- volk/include/volk/Makefile.am | 1 + volk/lib/Makefile.am | 5 +++-- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'volk') diff --git a/volk/config/Makefile.am b/volk/config/Makefile.am index b7dc2c161..0e556c6e2 100644 --- a/volk/config/Makefile.am +++ b/volk/config/Makefile.am @@ -29,7 +29,6 @@ m4macros = \ acx_pthread.m4 \ bnv_have_qt.m4 \ cppunit.m4 \ - gr_check_shm_open.m4 \ gr_lib64.m4 \ gr_libgnuradio_core_extra_ldflags.m4 \ gr_no_undefined.m4 \ @@ -37,15 +36,12 @@ m4macros = \ gr_pwin32.m4 \ gr_set_md_cpu.m4 \ lv_configure.m4 \ - gr_subversion.m4 \ gr_sysv_shm.m4 \ lf_cc.m4 \ lf_cxx.m4 \ lf_warnings.m4 \ - lv_lf_warnings.m4 \ lf_x11.m4 \ lv_set_simd_flags.m4 \ - lv_set_lv_arch.m4 \ mkstemp.m4 \ onceonly.m4 \ pkg.m4 \ diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 94c78ffff..9894fdb99 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -30,6 +30,7 @@ volkinclude_HEADERS = \ volk_config.h \ volk_tables.h \ volk_typedefs.h \ + volk_registry.h \ volk.h \ volk_cpu.h \ volk_environment_init.h \ diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 97eb75680..54df42d54 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -61,7 +61,7 @@ universal_CODE = \ volk_environment_init.c generic_CODE = \ - volk_cpu_generic.cc + volk_cpu_generic.c x86_CODE = \ volk_cpu_x86.c @@ -73,7 +73,7 @@ x86_64_SUBCODE = \ cpuid_x86_64.S powerpc_CODE = \ - volk_cpu_powerpc.cc + volk_cpu_powerpc.c if MD_CPU_generic @@ -236,6 +236,7 @@ libvolk_qa_la_LIBADD = \ noinst_HEADERS = \ volk_init.h \ qa_volk.h \ + assembly.h \ qa_16s_quad_max_star_aligned16.h \ qa_32fc_dot_prod_aligned16.h \ qa_32fc_square_dist_aligned16.h \ -- cgit From 46d55649012e4fb2838a6f8e9f3c9226ea8b2d50 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 12:19:28 -0500 Subject: volk: Working on VPATH build issues. Makes it through configure, fails on make. --- volk/Makefile.am | 2 +- volk/configure.ac | 2 +- volk/include/volk/make_set_simd.py | 1 - volk/lib/Makefile.am | 1 + volk/system_cleanup.sh | 10 ++++++---- 5 files changed, 9 insertions(+), 7 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index d2ef3dad4..19725e8ab 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -23,7 +23,7 @@ ACLOCAL_AMFLAGS = -I config include $(top_srcdir)/Makefile.common -EXTRA_DIST = bootstrap configure config.h.in +EXTRA_DIST = bootstrap configure config.h.in volk_config.h system_cleanup.sh SUBDIRS = config include lib #if USE_PYTHON #SUBDIRS += python diff --git a/volk/configure.ac b/volk/configure.ac index 4c095cbd5..96d4f4de7 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -72,6 +72,6 @@ AC_CONFIG_FILES([\ volk.pc \ ]) -AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x system_cleanup.sh && ./system_cleanup.sh $MYCPU $MYSUBCPU], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) +AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x $srcdir/system_cleanup.sh && $srcdir/system_cleanup.sh $MYCPU $MYSUBCPU $srcdir], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) AC_OUTPUT diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index e320bc748..c35aa06fa 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -98,7 +98,6 @@ def make_set_simd(dom) : tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; tempstring = tempstring + " fi\n"; - tempstring = tempstring + " echo $cf_with_lv_arch\n"; for domarch in dom: if str(domarch.attributes["type"].value) != "all": arch = str(domarch.attributes["name"].value); diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 54df42d54..4ee934e8b 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -46,6 +46,7 @@ lib_LTLIBRARIES = \ libvolk_runtime.la \ libvolk_qa.la +EXTRA_DIST = volk_mktables.c # ---------------------------------------------------------------- # The main library diff --git a/volk/system_cleanup.sh b/volk/system_cleanup.sh index 7ebd2caf8..5be9494f4 100755 --- a/volk/system_cleanup.sh +++ b/volk/system_cleanup.sh @@ -2,26 +2,28 @@ cp volk_config.h include/volk/ cd lib +srcdir=../$3 + case $1 in (x86) case $2 in (x86_64) - gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86_64.S volk_rank_archs.c volk_mktables.c + gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c ./volk_mktables ;; (*) - gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86.S volk_rank_archs.c volk_mktables.c + gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c ./volk_mktables ;; esac ;; (powerpc) - gcc -o volk_mktables -I../include -I. volk_cpu_powerpc.c volk_rank_archs.c volk_mktables.c + gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_powerpc.c $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c ./volk_mktables ;; (*) - gcc -o volk_mktables -I../include -I. volk_cpu_generic.c volk_rank_archs.c volk_mktables.c + gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_generic.c $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c ./volk_mktables ;; esac -- cgit From 1cc88091470dd4654b6936cda92d81841e135209 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:00:38 -0500 Subject: volk: more changes to build system so that VPATH builds properly and project makes distcheck. --- volk/config/lv_configure.m4 | 2 +- volk/include/volk/make_set_simd.py | 8 ++++---- volk/lib/Makefile.am | 11 ++++++++--- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'volk') diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 index efcec5d3e..c7a5fe960 100644 --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -111,7 +111,7 @@ dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) PKG_CHECK_EXISTS(cppunit, [PKG_CHECK_MODULES(CPPUNIT, cppunit >= 1.9.14)], [AM_PATH_CPPUNIT([1.9.14],[], - [AC_MSG_ERROR([LIBVECTOR requires cppunit. Stop])])]) + [AC_MSG_ERROR([VOLK requires cppunit. Stop])])]) dnl PKG_CHECK_MODULES(GNURADIO_CORE, gnuradio-core >= 3) dnl LIBS="$LIBS $GNURADIO_CORE_LIBS" diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index c35aa06fa..03c2b2d8f 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -12,7 +12,7 @@ def make_set_simd(dom) : tempstring = tempstring + " (x86)\n"; tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I $srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -23,7 +23,7 @@ def make_set_simd(dom) : tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " (*)\n" - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -36,7 +36,7 @@ def make_set_simd(dom) : tempstring = tempstring + " esac\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (powerpc)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; @@ -47,7 +47,7 @@ def make_set_simd(dom) : tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (*)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n"; + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_generic.c 2>&1`\"\n"; tempstring = tempstring + " then\n"; tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 4ee934e8b..7e808695f 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2008 Free Software Foundation, Inc. +# Copyright 2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -20,7 +20,9 @@ include $(top_srcdir)/Makefile.common -AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS) +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ + -I$(top_builddir)/include \ + $(LV_CXXFLAGS) $(WITH_INCLUDES) # We build 2 libraries and 1 executable here. One library contains @@ -46,7 +48,10 @@ lib_LTLIBRARIES = \ libvolk_runtime.la \ libvolk_qa.la -EXTRA_DIST = volk_mktables.c +EXTRA_DIST = \ + volk_mktables.c \ + volk_rank_archs.h \ + volk_proccpu_sim.c # ---------------------------------------------------------------- # The main library -- cgit From a8f33e1b577342fd8149d9308d474871c44c7d52 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:26:40 -0500 Subject: Removing autotests of volk during make check and distchecks since they take a long time to run. These can be run by hand by executing volk/lib/test_all Also made a comment about needing a possible fix for this makefile. --- volk/lib/Makefile.am | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 7e808695f..a95860d11 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -20,6 +20,10 @@ include $(top_srcdir)/Makefile.common +#FIXME: forcing the top_builddir for distcheck seems like a bit +# of a hack. Figure out the right way to do this to find built +# volk_config.h and volk_tables.h + AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ -I$(top_builddir)/include \ $(LV_CXXFLAGS) $(WITH_INCLUDES) @@ -40,7 +44,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ # list of programs run by "make check" and "make distcheck" -TESTS = test_all +#TESTS = test_all lib_LTLIBRARIES = \ -- cgit From 63f520734e4e112da32f0dc5a59107c1397199ae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 8 Dec 2010 17:58:46 -0500 Subject: volk: Changing version number to 0.1. --- volk/configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/configure.ac b/volk/configure.ac index 96d4f4de7..84c6bbce0 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -20,7 +20,7 @@ AC_PREREQ(2.57) AC_CONFIG_AUX_DIR([.]) AC_CONFIG_SRCDIR([lib/test_all.cc]) AM_CONFIG_HEADER(config.h) -AM_INIT_AUTOMAKE(volk,0.0svn) +AM_INIT_AUTOMAKE(volk,0.1) dnl This is kind of non-standard, but it sure shortens up this file :-) -- cgit From 79ca8f230d0f0758656e32a2bb0317f03e3317d0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 16:52:33 -0500 Subject: volk: remove redundant warning message at top of volk.c file. --- volk/include/volk/make_c.py | 2 -- 1 file changed, 2 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index 08d6d949c..f2432d7a4 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -5,8 +5,6 @@ from volk_regexp import * def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = ""; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; - tempstring = tempstring + '/*this file is auto generated by volk_register.py*/'; tempstring = tempstring + '\n\n#include\n'; tempstring = tempstring + '#include\n'; -- cgit From 97ff51156b492a1e52420a1f12cd75eccf2235ac Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 16:53:06 -0500 Subject: volk: Removing erroneous SSE function that actually usese an SSE3 intrin (mm_hadd_ps). --- .../volk/volk_16sc_magnitude_32f_aligned16.h | 66 ---------------------- 1 file changed, 66 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 29c58ceb8..8ba5737e8 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -70,72 +70,6 @@ static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector } #endif /* LV_HAVE_SSE3 */ -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - #if LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector -- cgit From 87889c20c7551b39bb508408035c894dfc32fb0d Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:09:23 -0500 Subject: volk: modified the configure scripts to output which architectures it will be building based on the configure tests. --- volk/configure.ac | 6 ++++++ volk/include/volk/make_set_simd.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'volk') diff --git a/volk/configure.ac b/volk/configure.ac index 84c6bbce0..2051064f6 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -75,3 +75,9 @@ AC_CONFIG_FILES([\ AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x $srcdir/system_cleanup.sh && $srcdir/system_cleanup.sh $MYCPU $MYSUBCPU $srcdir], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) AC_OUTPUT + +echo "" +echo "The following architectures will be built:" +echo " $BUILT_ARCHS" +echo "" + diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 03c2b2d8f..279ce06e2 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -1,3 +1,20 @@ +# +# Copyright 2010 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + from xml.dom import minidom def make_set_simd(dom) : @@ -93,6 +110,7 @@ def make_set_simd(dom) : arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; @@ -138,6 +156,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" @@ -182,6 +201,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" -- cgit From f3c684751dc3da3a06d5960d8b961739bdf0fd12 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:34:29 -0500 Subject: volk: adding generic QA test for 16sc_magnitude_32f. --- volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 42 ++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index 06dff2fd5..2c9e48f6e 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -8,7 +8,47 @@ #ifndef LV_HAVE_SSE3 void qa_16sc_magnitude_32f_aligned16::t1() { - printf("sse3 not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + std::complex input0[vlen] __attribute__ ((aligned (16))); + + float output_generic[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + int16_t* inputLoad = (int16_t*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (int16_t)(rand() - (RAND_MAX/2)); + } + printf("16sc_magnitude_32f_aligned\n"); + + float scale = 32768.0; + for(int i = 0; i < vlen; ++i) { + float re = (float)(input0[i].real())/scale; + float im = (float)(input0[i].imag())/scale; + output_known[i] = sqrt(re*re + im*im); + } + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, scale, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 100; ++i) { + printf("inputs: %d + j%d\n", input0[i].real(), input0[i].imag()); + printf("generic... %f == %f\n", output_generic[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4); + } } #else -- cgit From 31c85c66f38ed304db06e0696b3df1d2407378c8 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 17:53:05 -0500 Subject: volk: Adding a few more generic-only test cases. --- volk/lib/qa_32f_add_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_divide_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_multiply_aligned16.cc | 55 ++++++++++++++++++++++++++++++++++- volk/lib/qa_32f_sqrt_aligned16.cc | 53 +++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 3 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc index 92f35c7ec..002aebfc9 100644 --- a/volk/lib/qa_32f_add_aligned16.cc +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_add_aligned16::t1() { - printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] + input1[i]; + } + printf("32f_add_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index b20999beb..8826bf94f 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_divide_aligned16::t1() { - printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] / input1[i]; + } + printf("32f_divide_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc index c77fe97da..e52748466 100644 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -8,7 +27,41 @@ #ifndef LV_HAVE_SSE void qa_32f_multiply_aligned16::t1() { - printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + float input1[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + output_known[i] = input0[i] * input1[i]; + } + printf("32f_multiply_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i], input1[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + } } #else diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc index a3e6abc18..9a5f71de0 100644 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -1,3 +1,22 @@ +/* -*- c++ -*- */ +/* + * Copyright 2010 Free Software Foundation, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, see + * . + */ + #include #include #include @@ -9,6 +28,40 @@ void qa_32f_sqrt_aligned16::t1() { printf("sse not available... no test performed\n"); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + float input0[vlen] __attribute__ ((aligned (16))); + + float output0[vlen] __attribute__ ((aligned (16))); + float output_known[vlen] __attribute__ ((aligned (16))); + + // No reason to test negative numbers because they result in NaN. + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); + output_known[i] = sqrt(input0[i]); + } + printf("32f_sqrt_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + + /* + for(int i = 0; i < 10; ++i) { + printf("inputs: %f\n", input0[i]); + printf("generic... %f == %f\n", output0[i], output_known[i]); + } + */ + + for(int i = 0; i < vlen; ++i) { + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output_known[i], fabs(output0[i])*1e-4); + } } #else -- cgit From 61d7638304db3c999bd43808fce0e531996dba87 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 9 Dec 2010 20:24:21 -0500 Subject: volk: readding 16sc_magnitude_32f_sse with fix for SSE hadd_ps error. --- .../volk/volk_16sc_magnitude_32f_aligned16.h | 70 ++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'volk') diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 8ba5737e8..9c2a48835 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -70,6 +70,76 @@ static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector } #endif /* LV_HAVE_SSE3 */ +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + #if LV_HAVE_GENERIC /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector -- cgit From 8375fd6ca2f6e5edb923abe0d6341b6d4d2d1aae Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 01:48:17 -0500 Subject: volk: Fixing build system to handle making volk_mktables, volk_tables.h, and volk_config.h instead of a standalone shell script. --- volk/configure.ac | 9 ++++++-- volk/include/volk/Makefile.am | 48 ++++++++++++++++++++++++++++++++++++++++++- volk/lib/Makefile.am | 1 - 3 files changed, 54 insertions(+), 4 deletions(-) (limited to 'volk') diff --git a/volk/configure.ac b/volk/configure.ac index 2051064f6..8f17e5065 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -63,6 +63,13 @@ dnl AX_BOOST_WSERIALIZATION AC_CONFIG_HEADERS([volk_config.h]) LV_SET_SIMD_FLAGS +# FIXME: Not very extensible to supporting more processors easily +AM_CONDITIONAL([MYCPU_X86], [test "$MD_CPU" = "x86"]) +AM_CONDITIONAL([MYSUBCPU_X86], [test "$MD_SUBCPU" = "x86"]) +AM_CONDITIONAL([MYSUBCPU_X86_64], [test "$MD_SUBCPU" = "x86_64"]) +AM_CONDITIONAL([MYSUBCPU_POWERPC], [test "$MD_SUBCPU" = "powerpc"]) +AM_CONDITIONAL([MYSUBCPU_GENERIC], [test "$MD_SUBCPU" != "powerpc" && test "$MD_SUBCPU" != "x86" && test "$MD_SUBCPU" != "x86_64"]) + AC_CONFIG_FILES([\ Makefile \ config/Makefile \ @@ -72,8 +79,6 @@ AC_CONFIG_FILES([\ volk.pc \ ]) -AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x $srcdir/system_cleanup.sh && $srcdir/system_cleanup.sh $MYCPU $MYSUBCPU $srcdir], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU]) - AC_OUTPUT echo "" diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9894fdb99..256cc8420 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -20,8 +20,15 @@ include $(top_srcdir)/Makefile.common +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ + $(LV_CXXFLAGS) $(WITH_INCLUDES) + volkincludedir = $(prefix)/include/volk +BUILT_SOURCES: \ + volk_config.h \ + volk_tables.h + volkinclude_HEADERS = \ volk_complex.h \ volk_common.h \ @@ -122,6 +129,43 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_aligned16.h \ volk_8s_convert_32f_unaligned16.h +VOLK_MKTABLES_SOURCES = \ + $(abs_top_srcdir)/lib/volk_rank_archs.c \ + $(abs_top_srcdir)/lib/volk_mktables.c + + +# FIXME: Not very extensible to supporting more processors easily +if MYSUBCPU_X86_64 + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_x86.c \ + $(abs_top_srcdir)/lib/cpuid_x86_64.S +endif + +if MYSUBCPU_X86 + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_x86.c \ + $(abs_top_srcdir)/lib/cpuid_x86.S +endif + +if MYSUBCPU_POWERPC + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_powerpc.c +endif + +if MYSUBCPU_GENERIC + VOLK_MKTABLES_SOURCES += \ + $(abs_top_srcdir)/lib/volk_cpu_generic.c +endif + +volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) + $(CC) -o $@ $^ $(AM_CPPFLAGS) + +volk_tables.h: volk_mktables$(EXEEXT) + ./volk_mktables$(EXEEXT) + +volk_config.h: $(abs_top_builddir)/volk_config.h + cp $^ $(abs_top_builddir)/include/volk/$@ + distclean-local: rm -f volk_config_fixed.h rm -f volk_config.h @@ -133,4 +177,6 @@ distclean-local: rm -f volk_tables.h rm -f *.pyc rm -f Makefile.in - rm -f volk_environment_init.h \ No newline at end of file + rm -f volk_environment_init.h + rm -f volk_mktables + rm -f $(BUILT_SOURCES) \ No newline at end of file diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index a95860d11..814d438fd 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -353,7 +353,6 @@ distclean-local: rm -f volk_cpu_x86.c rm -f volk_init.c rm -f volk_init.h - rm -f volk_mktables rm -f volk_mktables.c rm -f volk_proccpu_sim.c rm -f volk_runtime.c -- cgit From 5ccd0554a51cc508a091bf27cd1bebc25c7b2bf9 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 01:51:55 -0500 Subject: volk: Removing unnecessary shell script; last commit takes care if its functions. --- volk/Makefile.am | 2 +- volk/system_cleanup.sh | 31 ------------------------------- 2 files changed, 1 insertion(+), 32 deletions(-) delete mode 100755 volk/system_cleanup.sh (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index 19725e8ab..3521dd0e4 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -23,7 +23,7 @@ ACLOCAL_AMFLAGS = -I config include $(top_srcdir)/Makefile.common -EXTRA_DIST = bootstrap configure config.h.in volk_config.h system_cleanup.sh +EXTRA_DIST = bootstrap configure config.h.in volk_config.h SUBDIRS = config include lib #if USE_PYTHON #SUBDIRS += python diff --git a/volk/system_cleanup.sh b/volk/system_cleanup.sh deleted file mode 100755 index 5be9494f4..000000000 --- a/volk/system_cleanup.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -cp volk_config.h include/volk/ -cd lib -srcdir=../$3 - -case $1 in - (x86) - - case $2 in - (x86_64) - gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c - ./volk_mktables - ;; - (*) - gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c - ./volk_mktables - ;; - esac - ;; - (powerpc) - gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_powerpc.c $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c - ./volk_mktables - ;; - (*) - gcc -o volk_mktables -I$srcdir/include -I$srcdir/lib -I../include -I. $srcdir/lib/volk_cpu_generic.c $srcdir/lib/volk_rank_archs.c $srcdir/lib/volk_mktables.c - ./volk_mktables - ;; -esac -mv volk_tables.h ../include/volk/ - -- cgit From e7aa093f30c48c3c85abf446e24bc1cf144dc3e0 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 10 Dec 2010 02:22:20 -0500 Subject: volk: May be a hack, but it was required for my 32-bit Fedora 13 to work. --- volk/include/volk/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 256cc8420..9cf783c0f 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -158,7 +158,7 @@ if MYSUBCPU_GENERIC endif volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) + $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(abs_top_builddir)/include volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -- cgit From 88a4f3fcddb5a082caa39fe800ea893aface1280 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 11 Dec 2010 13:35:00 -0500 Subject: volk: changing the path variables again. This works on my various systems tested. Using abs_ path names failed on Ubuntu 8.04 32-bit. --- volk/include/volk/Makefile.am | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 9cf783c0f..04a43bd34 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -130,41 +130,41 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_unaligned16.h VOLK_MKTABLES_SOURCES = \ - $(abs_top_srcdir)/lib/volk_rank_archs.c \ - $(abs_top_srcdir)/lib/volk_mktables.c + $(top_srcdir)/lib/volk_rank_archs.c \ + $(top_srcdir)/lib/volk_mktables.c # FIXME: Not very extensible to supporting more processors easily if MYSUBCPU_X86_64 VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_x86.c \ - $(abs_top_srcdir)/lib/cpuid_x86_64.S + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86_64.S endif if MYSUBCPU_X86 VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_x86.c \ - $(abs_top_srcdir)/lib/cpuid_x86.S + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86.S endif if MYSUBCPU_POWERPC VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_powerpc.c + $(top_srcdir)/lib/volk_cpu_powerpc.c endif if MYSUBCPU_GENERIC VOLK_MKTABLES_SOURCES += \ - $(abs_top_srcdir)/lib/volk_cpu_generic.c + $(top_srcdir)/lib/volk_cpu_generic.c endif volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) - $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(abs_top_builddir)/include + $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include volk_tables.h: volk_mktables$(EXEEXT) ./volk_mktables$(EXEEXT) -volk_config.h: $(abs_top_builddir)/volk_config.h - cp $^ $(abs_top_builddir)/include/volk/$@ +volk_config.h: $(top_builddir)/volk_config.h + cp $^ $(top_builddir)/include/volk/$@ distclean-local: rm -f volk_config_fixed.h -- cgit From 6cd7639e86285c2fc0f3771fd1111c4456d477a4 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 11 Dec 2010 13:39:00 -0500 Subject: volk: fix for running 32-bit OS on 64-bit processor. System is correctly identified as 32-bit and compiles with the correct flags. --- volk/include/volk/archs.xml | 10 +++++----- volk/include/volk/make_cpuid_x86_c.py | 26 +++++++++++++++++++++----- volk/include/volk/make_set_simd.py | 25 +++++++++++++++++++++---- 3 files changed, 47 insertions(+), 14 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index c9c441008..b7c98500f 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -9,12 +9,10 @@ maltivec - - 0x80000001 + m32 - d - 29 - 0 + MD_SUBCPU + x86_64 @@ -23,6 +21,8 @@ 29 m64 1 + MD_SUBCPU + x86 diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index fd83e5e5c..8ebe243e5 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -33,17 +33,33 @@ def make_cpuid_x86_c(dom) : for domarch in dom: if str(domarch.attributes["type"].value) == "x86": + if "no_test" in domarch.attributes.keys(): + no_test = str(domarch.attributes["no_test"].value); + if no_test == "true": + no_test = True; + else: + no_test = False; + else: + no_test = False; arch = str(domarch.attributes["name"].value); op = domarch.getElementsByTagName("op"); - op = str(op[0].firstChild.data); + if op: + op = str(op[0].firstChild.data); reg = domarch.getElementsByTagName("reg"); - reg = str(reg[0].firstChild.data); + if reg: + reg = str(reg[0].firstChild.data); shift = domarch.getElementsByTagName("shift"); - shift = str(shift[0].firstChild.data); + if shift: + shift = str(shift[0].firstChild.data); val = domarch.getElementsByTagName("val"); - val = str(val[0].firstChild.data); + if val: + val = str(val[0].firstChild.data); - if op == "1": + if no_test: + tempstring = tempstring + "int i_can_has_" + arch + " () {\n" + tempstring = tempstring + " return 1;\n" + tempstring = tempstring + "}\n\n" + elif op == "1": tempstring = tempstring + "int i_can_has_" + arch + " () {\n" tempstring = tempstring + " unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n" tempstring = tempstring + " return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n" diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 279ce06e2..842366b18 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -112,9 +112,11 @@ def make_set_simd(dom) : tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; + tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; tempstring = tempstring + " cf_with_lv_arch=$lv_PROCCPU\n"; - + tempstring = tempstring + " OVERRULE_FLAG=\"yes\"\n"; + tempstring = tempstring + " fi\n"; for domarch in dom: if str(domarch.attributes["type"].value) != "all": @@ -132,6 +134,16 @@ def make_set_simd(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); atype = str(domarch.attributes["type"].value); + overrule = domarch.getElementsByTagName("overrule"); + if overrule: + overrule = str(overrule[0].firstChild.data); + else: + overrule = ""; + overrule_val = domarch.getElementsByTagName("overrule_val"); + if overrule_val: + overrule_val = str(overrule_val[0].firstChild.data); + else: + overrule_val = ""; flag = domarch.getElementsByTagName("flag"); flag = str(flag[0].firstChild.data); if atype == "x86": @@ -153,10 +165,15 @@ def make_set_simd(dom) : tempstring = tempstring + " indLV_ARCH=yes\n" tempstring = tempstring + " fi\n" tempstring = tempstring + " done\n" - tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + + tempstring = tempstring + " if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" @@ -201,7 +218,7 @@ def make_set_simd(dom) : tempstring = tempstring + " if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " ADDONS=\"${ADDONS} -" + flag + "\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\""; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " fi\n" tempstring = tempstring + " indCC=no\n" -- cgit From 09c84f41c3265d27a7812e95a2906b17a7f01ca8 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Mon, 13 Dec 2010 19:04:05 -0800 Subject: volk: added ORC integration. still requires polishing. --- volk/config/orc.m4 | 67 +++++++++++++++++++++++++++ volk/orc/Makefile.am | 25 ++++++++++ volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc | 4 ++ 3 files changed, 96 insertions(+) create mode 100644 volk/config/orc.m4 create mode 100644 volk/orc/Makefile.am create mode 100644 volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 new file mode 100644 index 000000000..3c501762d --- /dev/null +++ b/volk/config/orc.m4 @@ -0,0 +1,67 @@ +dnl pkg-config-based checks for Orc + +dnl specific: +dnl ORC_CHECK([REQUIRED_VERSION]) + +AC_DEFUN([ORC_CHECK], +[ + ORC_REQ=ifelse([$1], , "0.4.6", [$1]) + + AC_ARG_ENABLE(orc, + AC_HELP_STRING([--enable-orc],[use Orc if installed]), + [case "${enableval}" in + auto) enable_orc=auto ;; + yes) enable_orc=yes ;; + no) enable_orc=no ;; + *) AC_MSG_ERROR(bad value ${enableval} for --enable-orc) ;; + esac + ], + [enable_orc=auto]) dnl Default value + + if test "x$enable_orc" != "xno" ; then + PKG_CHECK_MODULES(ORC, orc-0.4 >= $ORC_REQ, [ + AC_DEFINE(HAVE_ORC, 1, [Use Orc]) + if test "x$ORCC" = "x" ; then + ORCC=`$PKG_CONFIG --variable=orcc orc-0.4` + fi + AC_SUBST(ORCC) + ORCC_FLAGS="--compat $ORC_REQ" + AC_SUBST(ORCC_FLAGS) + HAVE_ORC=yes + HAVE_ORCC=yes + if test "x$cross_compiling" = "xyes" ; then + HAVE_ORCC=no + fi + ], [ + if test "x$enable_orc" = "xyes" ; then + AC_MSG_ERROR([--enable-orc specified, but Orc >= $ORC_REQ not found]) + fi + AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) + HAVE_ORC=no + HAVE_ORCC=no + ]) + else + AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) + HAVE_ORC=no + HAVE_ORCC=no + fi + AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"]) + AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"]) + +])) + +AC_DEFUN([ORC_OUTPUT], +[ + if test "$HAVE_ORC" = yes ; then + printf "configure: *** Orc acceleration enabled.\n" + else + if test "x$enable_orc" = "xno" ; then + printf "configure: *** Orc acceleration disabled by --disable-orc.\n" + else + printf "configure: *** Orc acceleration disabled. Requires Orc >= $ORC_REQ, which was\n" + printf " not found.\n" + fi + fi + printf "\n" +]) + diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am new file mode 100644 index 000000000..f5aa7d45c --- /dev/null +++ b/volk/orc/Makefile.am @@ -0,0 +1,25 @@ +# +# Copyright 2008 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +include $(top_srcdir)/Makefile.common + +libvolk_orc_a_SOURCES = volk_32f_sqrt_aligned16_orc_impl.c +libvolk_orc_a_LDFLAGS = -lorc-0.4 +lib_LIBRARIES = libvolk_orc.a diff --git a/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc b/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc new file mode 100644 index 000000000..0983271db --- /dev/null +++ b/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc @@ -0,0 +1,4 @@ +.function volk_32f_sqrt_aligned16_orc_impl +.source 4 src +.dest 4 dst +sqrtf dst, src -- cgit From bef3db60e73953f2d2ecdc6a86a81e11df3b103d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Mon, 13 Dec 2010 19:18:45 -0800 Subject: volk: committed some stuff i neglected --- volk/Makefile.am | 5 +++-- volk/config/Makefile.am | 1 + volk/config/lv_configure.m4 | 3 +++ volk/configure.ac | 1 + volk/include/volk/Makefile.am | 2 +- volk/include/volk/archs.xml | 4 ++++ volk/include/volk/make_set_simd.py | 3 +++ volk/include/volk/volk_32f_sqrt_aligned16.h | 13 +++++++++++++ volk/lib/Makefile.am | 17 +++++++++++------ volk/lib/qa_32f_sqrt_aligned16.cc | 15 +++++++++++++++ 10 files changed, 55 insertions(+), 9 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index 3521dd0e4..c1cd9aace 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -24,7 +24,7 @@ ACLOCAL_AMFLAGS = -I config include $(top_srcdir)/Makefile.common EXTRA_DIST = bootstrap configure config.h.in volk_config.h -SUBDIRS = config include lib +SUBDIRS = config orc include lib #if USE_PYTHON #SUBDIRS += python #endif @@ -53,4 +53,5 @@ distclean-local: -rm -f include/Makefile.in -rm -f lib/Makefile.in -rm -f python/Makefile.in - -rm -f configure \ No newline at end of file + -rm -f configure + -rm -f orc/Makefile.in diff --git a/volk/config/Makefile.am b/volk/config/Makefile.am index 0e556c6e2..1d0041e35 100644 --- a/volk/config/Makefile.am +++ b/volk/config/Makefile.am @@ -45,6 +45,7 @@ m4macros = \ mkstemp.m4 \ onceonly.m4 \ pkg.m4 \ + orc.m4 \ gcc_version_workaround.m4 diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 index c7a5fe960..f98b2dc5b 100644 --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -100,6 +100,9 @@ dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) GR_PWIN32 GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS + dnl Check for liborc + ORC_CHECK + LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" AC_CHECK_PROG([XMLTO],[xmlto],[yes],[]) diff --git a/volk/configure.ac b/volk/configure.ac index 8f17e5065..8e2f5b8b9 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -76,6 +76,7 @@ AC_CONFIG_FILES([\ include/Makefile \ include/volk/Makefile \ lib/Makefile \ + orc/Makefile \ volk.pc \ ]) diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 04a43bd34..99276ab87 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -179,4 +179,4 @@ distclean-local: rm -f Makefile.in rm -f volk_environment_init.h rm -f volk_mktables - rm -f $(BUILT_SOURCES) \ No newline at end of file + rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index b7c98500f..a828e5ad0 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -5,6 +5,10 @@ none + + lorc-0.4 + + maltivec diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 842366b18..e568aebfa 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -111,6 +111,9 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; + tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n"; + tempstring = tempstring + " fi\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h index 0b2eaf251..f6996ad5f 100644 --- a/volk/include/volk/volk_32f_sqrt_aligned16.h +++ b/volk/include/volk/volk_32f_sqrt_aligned16.h @@ -58,6 +58,19 @@ static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int); +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 814d438fd..1291b01cd 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -45,7 +45,9 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) \ # list of programs run by "make check" and "make distcheck" #TESTS = test_all - +#orc stuff gets built in the ORC directory conditional to ORC being enabled. +#it gets linked in during the build of libvolk as an added library. +#there might be a better way to do this. lib_LTLIBRARIES = \ libvolk.la \ @@ -72,6 +74,9 @@ universal_CODE = \ generic_CODE = \ volk_cpu_generic.c + +orc_CODE = \ + volk_cpu_orc.c x86_CODE = \ volk_cpu_x86.c @@ -133,10 +138,9 @@ endif -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 - -libvolk_la_LIBADD = +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 +libvolk_la_LIBADD = ../orc/libvolk_orc.a @@ -233,11 +237,12 @@ libvolk_qa_la_SOURCES = \ qa_32f_stddev_aligned16.cc \ qa_32f_stddev_and_mean_aligned16.cc -libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 libvolk_qa_la_LIBADD = \ libvolk.la \ libvolk_runtime.la \ + ../orc/libvolk_orc.a \ $(CPPUNIT_LIBS) # ---------------------------------------------------------------- diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc index 9a5f71de0..81d66dad7 100644 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -52,6 +52,14 @@ void qa_32f_sqrt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + /* for(int i = 0; i < 10; ++i) { printf("inputs: %f\n", input0[i]); @@ -92,6 +100,13 @@ void qa_32f_sqrt_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse"); } -- cgit From 611526f9dfba0df4a1a49d47916706438ac194b3 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 01:00:29 -0800 Subject: Volk: Automated more automake for orc. Brought orcc generation in. Shared library libvolk_orc.la. Linking is hackery right now with specified -lorc-0.4 flags; this should change. Otherwise pretty much OK. --- volk/Makefile.am | 1 + volk/config/lv_configure.m4 | 4 ++++ volk/config/orc.m4 | 2 ++ volk/lib/Makefile.am | 10 +++++----- volk/orc/Makefile.am | 14 +++++++++++--- 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index c1cd9aace..608aa2e27 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -55,3 +55,4 @@ distclean-local: -rm -f python/Makefile.in -rm -f configure -rm -f orc/Makefile.in + -rm -f orc/*.c diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 index f98b2dc5b..47ad543f8 100644 --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -104,6 +104,10 @@ dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) ORC_CHECK LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" + + if test HAVE_ORC = true; then + LDFLAGS="$LDFLAGS $ORC_LDFLAGS" + fi AC_CHECK_PROG([XMLTO],[xmlto],[yes],[]) AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes]) diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index 3c501762d..03c12a8f9 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -26,6 +26,7 @@ AC_DEFUN([ORC_CHECK], fi AC_SUBST(ORCC) ORCC_FLAGS="--compat $ORC_REQ" + ORC_LDFLAGS="-lorc-0.4" AC_SUBST(ORCC_FLAGS) HAVE_ORC=yes HAVE_ORCC=yes @@ -44,6 +45,7 @@ AC_DEFUN([ORC_CHECK], AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) HAVE_ORC=no HAVE_ORCC=no + ORC_LDFLAGS="" fi AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"]) AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"]) diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 1291b01cd..649d461e0 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -138,9 +138,9 @@ endif -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 -libvolk_la_LIBADD = ../orc/libvolk_orc.a +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 +libvolk_la_LIBADD = ../orc/libvolk_orc.la @@ -237,12 +237,12 @@ libvolk_qa_la_SOURCES = \ qa_32f_stddev_aligned16.cc \ qa_32f_stddev_and_mean_aligned16.cc -libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -lorc-0.4 +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 libvolk_qa_la_LIBADD = \ libvolk.la \ libvolk_runtime.la \ - ../orc/libvolk_orc.a \ + ../orc/libvolk_orc.la \ $(CPPUNIT_LIBS) # ---------------------------------------------------------------- diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index f5aa7d45c..092e66534 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -19,7 +19,15 @@ # include $(top_srcdir)/Makefile.common +lib_LTLIBRARIES = libvolk_orc.la +libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS) -libvolk_orc_a_SOURCES = volk_32f_sqrt_aligned16_orc_impl.c -libvolk_orc_a_LDFLAGS = -lorc-0.4 -lib_LIBRARIES = libvolk_orc.a +libvolk_orc_la_SOURCES = \ +volk_32f_sqrt_aligned16_orc_impl.orc + + + +ORCC_FLAGS = --implementation --lazy-init + +.orc.c: + $(ORCC) $(ORCC_FLAGS) -o $@ $< -- cgit From 05f4bced29987a0a573d1fc5b214f3fa01dc84bd Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 13:36:55 -0800 Subject: Volk: More autotools stuff for Orc. Should build OK with or without Orc now. --- volk/Makefile.am | 7 ++++++- volk/config/lv_configure.m4 | 4 ---- volk/config/orc.m4 | 1 - volk/configure.ac | 5 ++++- volk/include/volk/make_set_simd.py | 10 ++++++++-- volk/lib/Makefile.am | 18 +++++++++++++----- 6 files changed, 31 insertions(+), 14 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index 608aa2e27..d1ebdc78d 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -24,7 +24,12 @@ ACLOCAL_AMFLAGS = -I config include $(top_srcdir)/Makefile.common EXTRA_DIST = bootstrap configure config.h.in volk_config.h -SUBDIRS = config orc include lib +SUBDIRS = config +if HAVE_ORC +SUBDIRS += orc +endif +SUBDIRS += include lib + #if USE_PYTHON #SUBDIRS += python #endif diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 index 47ad543f8..f98b2dc5b 100644 --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -104,10 +104,6 @@ dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) ORC_CHECK LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" - - if test HAVE_ORC = true; then - LDFLAGS="$LDFLAGS $ORC_LDFLAGS" - fi AC_CHECK_PROG([XMLTO],[xmlto],[yes],[]) AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes]) diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index 03c12a8f9..fa48b10df 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -45,7 +45,6 @@ AC_DEFUN([ORC_CHECK], AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) HAVE_ORC=no HAVE_ORCC=no - ORC_LDFLAGS="" fi AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"]) AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"]) diff --git a/volk/configure.ac b/volk/configure.ac index 8e2f5b8b9..5a1eac3f2 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -76,9 +76,12 @@ AC_CONFIG_FILES([\ include/Makefile \ include/volk/Makefile \ lib/Makefile \ - orc/Makefile \ volk.pc \ ]) + +if test "$HAVE_ORC" = yes; then + AC_CONFIG_FILES([orc/Makefile]) +fi AC_OUTPUT diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index e568aebfa..78a00476d 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -183,8 +183,14 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + if arch == "orc": + tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " fi\n"; + else: + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 649d461e0..385401ae1 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -138,10 +138,13 @@ endif -libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 -libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 +if HAVE_ORC libvolk_la_LIBADD = ../orc/libvolk_orc.la - +libvolk_la_LDFLAGS += -lorc-0.4 +libvolk_runtime_la_LDFLAGS += -lorc-0.4 +endif # ---------------------------------------------------------------- @@ -237,13 +240,18 @@ libvolk_qa_la_SOURCES = \ qa_32f_stddev_aligned16.cc \ qa_32f_stddev_and_mean_aligned16.cc -libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(ORC_LDFLAGS) -lorc-0.4 +libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 libvolk_qa_la_LIBADD = \ libvolk.la \ libvolk_runtime.la \ - ../orc/libvolk_orc.la \ $(CPPUNIT_LIBS) + +if HAVE_ORC +libvolk_qa_la_LIBADD += \ + ../orc/libvolk_orc.la + libvolk_qa_la_LDFLAGS += -lorc-0.4 +endif # ---------------------------------------------------------------- # headers that don't get installed -- cgit From be78b530701850b964118fd0f63ba2bbdca9759d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 14:14:00 -0800 Subject: pre-patch... --- volk/include/volk/volk_32s_and_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_8s_convert_16s_aligned16.h | 12 ++++++++++++ volk/include/volk/volk_8s_convert_32f_aligned16.h | 13 +++++++++++++ volk/orc/Makefile.am | 5 ++++- 4 files changed, 42 insertions(+), 2 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h index e9f1e3a43..16c63fd48 100644 --- a/volk/include/volk/volk_32s_and_aligned16.h +++ b/volk/include/volk/volk_32s_and_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_ } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h index 0efe3c6a1..c52c64eae 100644 --- a/volk/include/volk/volk_8s_convert_16s_aligned16.h +++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h @@ -65,6 +65,18 @@ static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h index 54b66ef8f..700a0fa42 100644 --- a/volk/include/volk/volk_8s_convert_32f_aligned16.h +++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h @@ -86,6 +86,19 @@ static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, co } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 092e66534..a4af7dac6 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -23,7 +23,10 @@ lib_LTLIBRARIES = libvolk_orc.la libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS) libvolk_orc_la_SOURCES = \ -volk_32f_sqrt_aligned16_orc_impl.orc +volk_32f_sqrt_aligned16_orc_impl.orc \ +volk_8s_convert_16s_aligned16_orc_impl.orc \ +volk_8s_convert_32f_aligned16_orc_impl.orc \ +volk_32s_and_aligned16_orc_impl -- cgit From cbc44555a0ee99867dd51a1a7b10f19d9935a5c7 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 14:18:59 -0800 Subject: Volk: added some more Orc functions. Haven't added to tests yet. --- volk/orc/volk_32s_and_aligned16_orc_impl.orc | 5 +++++ volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc | 4 ++++ volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc | 9 +++++++++ 3 files changed, 18 insertions(+) create mode 100644 volk/orc/volk_32s_and_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/orc/volk_32s_and_aligned16_orc_impl.orc b/volk/orc/volk_32s_and_aligned16_orc_impl.orc new file mode 100644 index 000000000..9d3c7b733 --- /dev/null +++ b/volk/orc/volk_32s_and_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32s_and_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +andl dst, src1, src2 diff --git a/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc new file mode 100644 index 000000000..8322b529a --- /dev/null +++ b/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc @@ -0,0 +1,4 @@ +.function volk_8s_convert_16s_aligned16_orc_impl +.source 1 src +.dest 2 dst +mulsbw dst, src, 255 diff --git a/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc new file mode 100644 index 000000000..91a0084d7 --- /dev/null +++ b/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc @@ -0,0 +1,9 @@ +.function volk_8s_convert_32f_aligned16_orc_impl +.source 2 src +.dest 4 dst +.floatparam 4 scalar +.temp 4 flsrc +.temp 4 lsrc +convswl lsrc, src +convlf flsrc, lsrc +mulf dst, flsrc, scalar -- cgit From ebcfe1a6507d37a29890d5b29aae308fd02d6716 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 14:25:44 -0800 Subject: Fixed some broken. --- volk/orc/Makefile.am | 3 ++- volk/orc/volk_32f_add_aligned16_orc_impl.orc | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 volk/orc/volk_32f_add_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index a4af7dac6..dcde63083 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -26,7 +26,8 @@ libvolk_orc_la_SOURCES = \ volk_32f_sqrt_aligned16_orc_impl.orc \ volk_8s_convert_16s_aligned16_orc_impl.orc \ volk_8s_convert_32f_aligned16_orc_impl.orc \ -volk_32s_and_aligned16_orc_impl +volk_32s_and_aligned16_orc_impl.orc \ +volk_32f_add_aligned16_orc_impl.orc diff --git a/volk/orc/volk_32f_add_aligned16_orc_impl.orc b/volk/orc/volk_32f_add_aligned16_orc_impl.orc new file mode 100644 index 000000000..20e000f68 --- /dev/null +++ b/volk/orc/volk_32f_add_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_add_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +addf dst, src1, src2 -- cgit From d8031649fa3186d7e6b000dcfaa349deacf51262 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 16:41:14 -0800 Subject: Volk: patch via Nick M. --- volk/config/orc.m4 | 5 ++++- volk/lib/Makefile.am | 5 +---- volk/orc/Makefile.am | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'volk') diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index fa48b10df..7845a940c 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -26,8 +26,11 @@ AC_DEFUN([ORC_CHECK], fi AC_SUBST(ORCC) ORCC_FLAGS="--compat $ORC_REQ" - ORC_LDFLAGS="-lorc-0.4" + ORC_LDFLAGS=`$PKG_CONFIG --libs orc-0.4` + ORC_CFLAGS=`$PKG_CONFIG --cflags orc-0.4` AC_SUBST(ORCC_FLAGS) + AC_SUBST(ORC_LDFLAGS) + AC_SUBST(ORC_CFLAGS) HAVE_ORC=yes HAVE_ORCC=yes if test "x$cross_compiling" = "xyes" ; then diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 385401ae1..d38004f2a 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -74,9 +74,6 @@ universal_CODE = \ generic_CODE = \ volk_cpu_generic.c - -orc_CODE = \ - volk_cpu_orc.c x86_CODE = \ volk_cpu_x86.c @@ -356,7 +353,7 @@ noinst_PROGRAMS = \ test_all test_all_SOURCES = test_all.cc -test_all_LDADD = libvolk_qa.la +test_all_LDADD = libvolk_qa.la ../orc/libvolk_orc.la distclean-local: diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index dcde63083..5d82f540f 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -18,6 +18,8 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # +AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(ORC_CFLAGS) + include $(top_srcdir)/Makefile.common lib_LTLIBRARIES = libvolk_orc.la libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS) @@ -31,7 +33,7 @@ volk_32f_add_aligned16_orc_impl.orc -ORCC_FLAGS = --implementation --lazy-init +my_ORCC_FLAGS = --implementation --lazy-init $(ORCC_FLAGS) .orc.c: - $(ORCC) $(ORCC_FLAGS) -o $@ $< + $(ORCC) $(my_ORCC_FLAGS) -o $@ $< -- cgit From 2e9a7d350713b4e1b21458db8f3fce8a557858ae Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 17:13:40 -0800 Subject: Volk: Added QA tests for all the Orc stuff. Added a 16u_byteswap but it's broken right now. --- volk/include/volk/volk_16u_byteswap_aligned16.h | 12 ++++++++++++ volk/include/volk/volk_32f_add_aligned16.h | 14 +++++++++++++- volk/lib/qa_16u_byteswap_aligned16.cc | 9 +++++++++ volk/lib/qa_32f_add_aligned16.cc | 9 +++++++++ volk/lib/qa_32s_and_aligned16.cc | 9 +++++++++ volk/lib/qa_8s_convert_32f_aligned16.cc | 8 ++++++++ volk/orc/Makefile.am | 3 ++- volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc | 4 ++++ 8 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h index 698e958e4..8e628ab17 100644 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -61,5 +61,17 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points); +static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h index 721c60fd6..e7d8de265 100644 --- a/volk/include/volk/volk_32f_add_aligned16.h +++ b/volk/include/volk/volk_32f_add_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32f_add_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc index 6b19828a4..c30b6ba41 100644 --- a/volk/lib/qa_16u_byteswap_aligned16.cc +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -24,6 +24,7 @@ void qa_16u_byteswap_aligned16::t1() { uint16_t output0[vlen] __attribute__ ((aligned (16))); uint16_t output01[vlen] __attribute__ ((aligned (16))); + uint16_t output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); @@ -40,6 +41,13 @@ void qa_16u_byteswap_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16u_byteswap_aligned16_manual(output02, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2"); } @@ -54,6 +62,7 @@ void qa_16u_byteswap_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc index 002aebfc9..d9214e8a2 100644 --- a/volk/lib/qa_32f_add_aligned16.cc +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -78,6 +78,7 @@ void qa_32f_add_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -93,6 +94,13 @@ void qa_32f_add_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_add_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -107,6 +115,7 @@ void qa_32f_add_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc index 72d05cf6f..5720ee869 100644 --- a/volk/lib/qa_32s_and_aligned16.cc +++ b/volk/lib/qa_32s_and_aligned16.cc @@ -25,6 +25,7 @@ void qa_32s_and_aligned16::t1() { int32_t output0[vlen] __attribute__ ((aligned (16))); int32_t output01[vlen] __attribute__ ((aligned (16))); + int32_t output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); @@ -40,6 +41,13 @@ void qa_32s_and_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_and_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -54,6 +62,7 @@ void qa_32s_and_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc index 522da0b9d..3b3aa6919 100644 --- a/volk/lib/qa_8s_convert_32f_aligned16.cc +++ b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -40,6 +40,14 @@ void qa_8s_convert_32f_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); start = clock(); for(int count = 0; count < ITERS; ++count) { diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 5d82f540f..3f155d0be 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -29,7 +29,8 @@ volk_32f_sqrt_aligned16_orc_impl.orc \ volk_8s_convert_16s_aligned16_orc_impl.orc \ volk_8s_convert_32f_aligned16_orc_impl.orc \ volk_32s_and_aligned16_orc_impl.orc \ -volk_32f_add_aligned16_orc_impl.orc +volk_32f_add_aligned16_orc_impl.orc \ +volk_16u_byteswap_aligned16_orc_impl.orc diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc new file mode 100644 index 000000000..73c3e345e --- /dev/null +++ b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc @@ -0,0 +1,4 @@ +.function volk_16u_byteswap_aligned16_orc_impl +.dest 2 dst +.source 2 src +swapw dst, src -- cgit From 26415a1445490cc3230c5d793a41703931ae9d01 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 17:23:20 -0800 Subject: Volk: Nick's commits to make adding Orc a little more structurally sound --- volk/include/volk/archs.xml | 2 ++ volk/include/volk/make_set_simd.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index a828e5ad0..b61bfca09 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -7,6 +7,8 @@ lorc-0.4 + HAVE_ORC + no diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 78a00476d..d7109cfcb 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -110,10 +110,7 @@ def make_set_simd(dom) : arch = str(domarch.attributes["name"].value); tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n"; tempstring = tempstring + " ADDONS=\"\"\n"; - tempstring = tempstring + " BUILT_ARCHS=\"generic\"\n"; - tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; - tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} orc\"\n"; - tempstring = tempstring + " fi\n"; + tempstring = tempstring + " BUILT_ARCHS=\"\"\n"; tempstring = tempstring + " _MAKE_FAKE_PROCCPU\n"; tempstring = tempstring + " OVERRULE_FLAG=\"no\"\n"; tempstring = tempstring + " if test -z \"$cf_with_lv_arch\"; then\n"; @@ -183,14 +180,11 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - if arch == "orc": - tempstring = tempstring + " if test $HAVE_ORC = yes; then\n"; - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; - tempstring = tempstring + " fi\n"; - else: - tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; - tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" @@ -234,11 +228,15 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": + tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" + tempstring = tempstring + " AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n"; tempstring = tempstring + "])\n" return tempstring; -- cgit From 87a9b14e0b0e2c2d0dcd75d42f2a15211265f102 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 17:44:34 -0800 Subject: Volk: added references to libs instead of specifying them directly --- volk/lib/Makefile.am | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index d38004f2a..faab4a010 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -133,14 +133,21 @@ libvolk_runtime_la_SOURCES = \ $(universal_runtime_CODE) endif +volk_orc_LDFLAGS = \ + $(ORC_LDFLAGS) \ + -lorc-0.4 + +volk_orc_LIBADD = \ + ../orc/libvolk_orc.la - +if HAVE_ORC +libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) +libvolk_la_LIBADD = $(volk_orc_LIBADD) +else libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -if HAVE_ORC -libvolk_la_LIBADD = ../orc/libvolk_orc.la -libvolk_la_LDFLAGS += -lorc-0.4 -libvolk_runtime_la_LDFLAGS += -lorc-0.4 +libvolk_la_LIBADD = endif @@ -243,12 +250,6 @@ libvolk_qa_la_LIBADD = \ libvolk.la \ libvolk_runtime.la \ $(CPPUNIT_LIBS) - -if HAVE_ORC -libvolk_qa_la_LIBADD += \ - ../orc/libvolk_orc.la - libvolk_qa_la_LDFLAGS += -lorc-0.4 -endif # ---------------------------------------------------------------- # headers that don't get installed @@ -353,7 +354,7 @@ noinst_PROGRAMS = \ test_all test_all_SOURCES = test_all.cc -test_all_LDADD = libvolk_qa.la ../orc/libvolk_orc.la +test_all_LDADD = libvolk_qa.la distclean-local: -- cgit From 21426265324c883c91eeaaf75a81f2ccdc6e249d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 21:12:49 -0800 Subject: Volk: Build fixes to work with/without Orc. --- volk/Makefile.am | 2 +- volk/config/orc.m4 | 14 ++--------- volk/include/volk/archs.xml | 2 +- volk/include/volk/make_set_simd.py | 51 +++++++++++++++++++++++++++++++++++--- volk/lib/Makefile.am | 2 +- 5 files changed, 52 insertions(+), 19 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index d1ebdc78d..271d495cd 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -25,7 +25,7 @@ include $(top_srcdir)/Makefile.common EXTRA_DIST = bootstrap configure config.h.in volk_config.h SUBDIRS = config -if HAVE_ORC +if LV_HAVE_ORC SUBDIRS += orc endif SUBDIRS += include lib diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index 7845a940c..9645661b0 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -6,18 +6,8 @@ dnl ORC_CHECK([REQUIRED_VERSION]) AC_DEFUN([ORC_CHECK], [ ORC_REQ=ifelse([$1], , "0.4.6", [$1]) - - AC_ARG_ENABLE(orc, - AC_HELP_STRING([--enable-orc],[use Orc if installed]), - [case "${enableval}" in - auto) enable_orc=auto ;; - yes) enable_orc=yes ;; - no) enable_orc=no ;; - *) AC_MSG_ERROR(bad value ${enableval} for --enable-orc) ;; - esac - ], - [enable_orc=auto]) dnl Default value - + + enable_orc = auto if test "x$enable_orc" != "xno" ; then PKG_CHECK_MODULES(ORC, orc-0.4 >= $ORC_REQ, [ AC_DEFINE(HAVE_ORC, 1, [Use Orc]) diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml index b61bfca09..a19a5add9 100644 --- a/volk/include/volk/archs.xml +++ b/volk/include/volk/archs.xml @@ -7,7 +7,7 @@ lorc-0.4 - HAVE_ORC + LV_HAVE_ORC no diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index d7109cfcb..f2b7c0656 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -180,11 +180,22 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " ;;\n" tempstring = tempstring + " (powerpc)\n" @@ -228,11 +239,44 @@ def make_set_simd(dom) : tempstring = tempstring + " indCXX=no\n" tempstring = tempstring + " indLV_ARCH=no\n" elif atype == "all": - tempstring = tempstring + " if test -z \"" + overrule + "\" || test \"$" + overrule + "\" != \"" + overrule_val + "\" || test \"$OVERRULE_FLAG\" == \"no\"; then\n" + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; + tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; + tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; + tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " ;;\n" + tempstring = tempstring + " (*)\n" + for domarch in dom: + arch = str(domarch.attributes["name"].value); + atype = str(domarch.attributes["type"].value); + flag = domarch.getElementsByTagName("flag"); + flag = str(flag[0].firstChild.data); + if atype == "all": + tempstring = tempstring + " for i in $cf_with_lv_arch\n" + tempstring = tempstring + " do\n" + tempstring = tempstring + " if test \"X$i\" = X" + arch + "; then\n"; + tempstring = tempstring + " indLV_ARCH=yes\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " done\n" + tempstring = tempstring + " if test -n \"" + overrule + "\" && test \"$" + overrule + "\" == \"" + overrule_val + "\" && test \"$OVERRULE_FLAG\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n" + tempstring = tempstring + " indLV_ARCH=no\n" + tempstring = tempstring + " fi\n" + tempstring = tempstring + " if test \"$indLV_ARCH\" == \"yes\"; then\n" tempstring = tempstring + " AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n"; tempstring = tempstring + " LV_HAVE_" + arch.swapcase() + "=yes\n"; tempstring = tempstring + " BUILT_ARCHS=\"${BUILT_ARCHS} " + arch + "\"\n"; tempstring = tempstring + " fi\n" + tempstring = tempstring + " indLV_ARCH=no\n" tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" @@ -240,6 +284,5 @@ def make_set_simd(dom) : tempstring = tempstring + "])\n" return tempstring; - - + diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index faab4a010..253033461 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -140,7 +140,7 @@ volk_orc_LDFLAGS = \ volk_orc_LIBADD = \ ../orc/libvolk_orc.la -if HAVE_ORC +if LV_HAVE_ORC libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 $(volk_orc_LDFLAGS) libvolk_la_LIBADD = $(volk_orc_LIBADD) -- cgit From f9ee6a55cb397f9302769a25a8c959fa162354f0 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 14 Dec 2010 22:58:33 -0800 Subject: Volk: Some new basic Orc implementations with QA code --- volk/include/volk/volk_16u_byteswap_aligned16.h | 4 ++-- volk/include/volk/volk_32f_divide_aligned16.h | 13 +++++++++++++ volk/include/volk/volk_32f_multiply_aligned16.h | 14 +++++++++++++- volk/include/volk/volk_32f_subtract_aligned16.h | 14 ++++++++++++++ volk/lib/qa_16u_byteswap_aligned16.cc | 1 + volk/lib/qa_32f_divide_aligned16.cc | 10 ++++++++++ volk/lib/qa_32f_multiply_aligned16.cc | 9 +++++++++ volk/lib/qa_32f_subtract_aligned16.cc | 9 +++++++++ volk/orc/Makefile.am | 7 +++++-- volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc | 3 +-- volk/orc/volk_32f_divide_aligned16_orc_impl.orc | 5 +++++ volk/orc/volk_32f_multiply_aligned16_orc_impl.orc | 5 +++++ volk/orc/volk_32f_subtract_aligned16_orc_impl.orc | 5 +++++ 13 files changed, 92 insertions(+), 7 deletions(-) create mode 100644 volk/orc/volk_32f_divide_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h index 8e628ab17..9d19d1a45 100644 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ b/volk/include/volk/volk_16u_byteswap_aligned16.h @@ -67,9 +67,9 @@ static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, uns \param intsToSwap The vector of data to byte swap \param numDataPoints The number of data points */ -extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, uint16_t* intsToSwapAgain, unsigned int num_points); +extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_aligned16_orc_impl(intsToSwap, intsToSwap, num_points); + volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h index c00700cd8..c595b5e92 100644 --- a/volk/include/volk/volk_32f_divide_aligned16.h +++ b/volk/include/volk/volk_32f_divide_aligned16.h @@ -63,6 +63,19 @@ static inline void volk_32f_divide_aligned16_generic(float* cVector, const float } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h index b557580ab..87ae7bcf8 100644 --- a/volk/include/volk/volk_32f_multiply_aligned16.h +++ b/volk/include/volk/volk_32f_multiply_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32f_multiply_aligned16_generic(float* cVector, const flo } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h index ac3f5e5d1..e15242901 100644 --- a/volk/include/volk/volk_32f_subtract_aligned16.h +++ b/volk/include/volk/volk_32f_subtract_aligned16.h @@ -63,5 +63,19 @@ static inline void volk_32f_subtract_aligned16_generic(float* cVector, const flo } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc index c30b6ba41..b740f91df 100644 --- a/volk/lib/qa_16u_byteswap_aligned16.cc +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -30,6 +30,7 @@ void qa_16u_byteswap_aligned16::t1() { output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); } memcpy(output01, output0, vlen*sizeof(uint16_t)); + memcpy(output02, output0, vlen*sizeof(uint16_t)); printf("16u_byteswap_aligned\n"); diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index 8826bf94f..f104e0443 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -35,6 +35,7 @@ void qa_32f_divide_aligned16::t1() { float input1[vlen] __attribute__ ((aligned (16))); float output0[vlen] __attribute__ ((aligned (16))); + float output1[vlen] __attribute__ ((aligned (16))); float output_known[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { @@ -51,6 +52,14 @@ void qa_32f_divide_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output1, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); /* for(int i = 0; i < 10; ++i) { @@ -61,6 +70,7 @@ void qa_32f_divide_aligned16::t1() { for(int i = 0; i < vlen; ++i) { CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output_known[i]); } } diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc index e52748466..f9c034d70 100644 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -78,6 +78,7 @@ void qa_32f_multiply_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -93,6 +94,13 @@ void qa_32f_multiply_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_multiply_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -107,6 +115,7 @@ void qa_32f_multiply_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc index a7e1b5ae3..5a5a7c9b6 100644 --- a/volk/lib/qa_32f_subtract_aligned16.cc +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -25,6 +25,7 @@ void qa_32f_subtract_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -40,6 +41,13 @@ void qa_32f_subtract_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_subtract_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -54,6 +62,7 @@ void qa_32f_subtract_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 3f155d0be..c71625d87 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -25,12 +25,15 @@ lib_LTLIBRARIES = libvolk_orc.la libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS) libvolk_orc_la_SOURCES = \ -volk_32f_sqrt_aligned16_orc_impl.orc \ volk_8s_convert_16s_aligned16_orc_impl.orc \ volk_8s_convert_32f_aligned16_orc_impl.orc \ +volk_16u_byteswap_aligned16_orc_impl.orc \ volk_32s_and_aligned16_orc_impl.orc \ volk_32f_add_aligned16_orc_impl.orc \ -volk_16u_byteswap_aligned16_orc_impl.orc +volk_32f_subtract_aligned16_orc_impl.orc \ +volk_32f_divide_aligned16_orc_impl.orc \ +volk_32f_multiply_aligned16_orc_impl.orc \ +volk_32f_sqrt_aligned16_orc_impl.orc diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc index 73c3e345e..3ffd12ec0 100644 --- a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc +++ b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc @@ -1,4 +1,3 @@ .function volk_16u_byteswap_aligned16_orc_impl .dest 2 dst -.source 2 src -swapw dst, src +swapw dst, dst diff --git a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc new file mode 100644 index 000000000..870843f2a --- /dev/null +++ b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_divide_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +divf dst, src1, src2 diff --git a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc new file mode 100644 index 000000000..23619af4e --- /dev/null +++ b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_multiply_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +mulf dst, src1, src2 diff --git a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc new file mode 100644 index 000000000..17dbcad46 --- /dev/null +++ b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_subtract_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +subf dst, src1, src2 -- cgit From 15ad4b5398e474bfb52fdb7e826b69f3e398c0b0 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 15 Dec 2010 16:27:42 -0800 Subject: Volk: A bunch of new ORC routines plus tests. Also fixed a typo in the generic version of 16sc_magnitude_16s_a16. --- .../volk/volk_16sc_magnitude_16s_aligned16.h | 15 +++++++++++-- .../volk/volk_16sc_magnitude_32f_aligned16.h | 14 +++++++++++- .../volk/volk_32fc_magnitude_16s_aligned16.h | 14 +++++++++++- .../volk/volk_32fc_magnitude_32f_aligned16.h | 13 ++++++++++- volk/include/volk/volk_32s_or_aligned16.h | 14 +++++++++++- volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 9 ++++++++ volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 20 +++++++++++++++++ volk/lib/qa_32f_divide_aligned16.cc | 9 ++++++++ volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 9 ++++++++ volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 9 ++++++++ volk/lib/qa_32s_or_aligned16.cc | 9 ++++++++ volk/orc/Makefile.am | 6 +++++- .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc | 24 +++++++++++++++++++++ .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc | 25 ++++++++++++++++++++++ .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 25 ++++++++++++++++++++++ .../volk_32fc_magnitude_32f_aligned16_orc_impl.orc | 21 ++++++++++++++++++ volk/orc/volk_32s_or_aligned16_orc_impl.orc | 5 +++++ 17 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32s_or_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h index 1482ab82e..9f3222aa6 100644 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -164,7 +164,7 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV const int16_t* complexVectorPtr = (const int16_t*)complexVector; int16_t* magnitudeVectorPtr = magnitudeVector; unsigned int number = 0; - const float scalar = 32786.0; + const float scalar = 32768.0; for(number = 0; number < num_points; number++){ float real = ((float)(*complexVectorPtr++)) / scalar; float imag = ((float)(*complexVectorPtr++)) / scalar; @@ -173,7 +173,18 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index 9c2a48835..e063ae432 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -161,7 +161,19 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h index 4e64d8c22..4e590e120 100644 --- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h @@ -140,7 +140,19 @@ static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h index 7a8fd1ef9..3ea62da6a 100644 --- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h @@ -115,7 +115,18 @@ static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h index f4c427c4d..64748d535 100644 --- a/volk/include/volk/volk_32s_or_aligned16.h +++ b/volk/include/volk/volk_32s_or_aligned16.h @@ -63,7 +63,19 @@ static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc index b14610757..c8f13ff84 100644 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -23,6 +23,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() { std::complex input0[vlen] __attribute__ ((aligned (16))); int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_orc[vlen] __attribute__ ((aligned (16))); int16_t output_sse[vlen] __attribute__ ((aligned (16))); int16_t output_sse3[vlen] __attribute__ ((aligned (16))); @@ -40,6 +41,13 @@ void qa_16sc_magnitude_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); } @@ -64,6 +72,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); } } diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index 2c9e48f6e..e7178863c 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -15,6 +15,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() { std::complex input0[vlen] __attribute__ ((aligned (16))); float output_generic[vlen] __attribute__ ((aligned (16))); + float output_orc[vlen] __attribute__ ((aligned (16))); float output_known[vlen] __attribute__ ((aligned (16))); int16_t* inputLoad = (int16_t*)input0; @@ -37,6 +38,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, scale, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); /* for(int i = 0; i < 100; ++i) { @@ -48,6 +57,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_orc[i], output_known[i], fabs(output_generic[i])*1e-4); } } @@ -63,6 +73,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() { std::complex input0[vlen] __attribute__ ((aligned (16))); float output_generic[vlen] __attribute__ ((aligned (16))); + float output_orc[vlen] __attribute__ ((aligned (16))); float output_sse[vlen] __attribute__ ((aligned (16))); float output_sse3[vlen] __attribute__ ((aligned (16))); @@ -79,6 +90,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); @@ -104,6 +123,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); } } diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index f104e0443..b2c2ecf9a 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -88,6 +88,7 @@ void qa_32f_divide_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -103,6 +104,13 @@ void qa_32f_divide_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_divide_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -117,6 +125,7 @@ void qa_32f_divide_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index a4be1616b..c3e65866b 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -23,6 +23,7 @@ void qa_32fc_magnitude_16s_aligned16::t1() { std::complex input0[vlen] __attribute__ ((aligned (16))); int16_t output_generic[vlen] __attribute__ ((aligned (16))); + int16_t output_orc[vlen] __attribute__ ((aligned (16))); int16_t output_sse[vlen] __attribute__ ((aligned (16))); int16_t output_sse3[vlen] __attribute__ ((aligned (16))); @@ -40,6 +41,13 @@ void qa_32fc_magnitude_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_16s_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); } @@ -64,6 +72,7 @@ void qa_32fc_magnitude_16s_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); } } diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc index d69ada408..6a1d46c7a 100644 --- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -23,6 +23,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() { std::complex input0[vlen] __attribute__ ((aligned (16))); float output_generic[vlen] __attribute__ ((aligned (16))); + float output_orc[vlen] __attribute__ ((aligned (16))); float output_sse[vlen] __attribute__ ((aligned (16))); float output_sse3[vlen] __attribute__ ((aligned (16))); @@ -40,6 +41,13 @@ void qa_32fc_magnitude_32f_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_magnitude_32f_aligned16_manual(output_orc, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse"); } @@ -64,6 +72,7 @@ void qa_32fc_magnitude_32f_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); } } diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc index e09dfb91c..9ea5283a6 100644 --- a/volk/lib/qa_32s_or_aligned16.cc +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -25,6 +25,7 @@ void qa_32s_or_aligned16::t1() { int32_t output0[vlen] __attribute__ ((aligned (16))); int32_t output01[vlen] __attribute__ ((aligned (16))); + int32_t output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); @@ -40,6 +41,13 @@ void qa_32s_or_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32s_or_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -54,6 +62,7 @@ void qa_32s_or_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index c71625d87..3f105fd80 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -29,11 +29,15 @@ volk_8s_convert_16s_aligned16_orc_impl.orc \ volk_8s_convert_32f_aligned16_orc_impl.orc \ volk_16u_byteswap_aligned16_orc_impl.orc \ volk_32s_and_aligned16_orc_impl.orc \ +volk_32s_or_aligned16_orc_impl.orc \ volk_32f_add_aligned16_orc_impl.orc \ volk_32f_subtract_aligned16_orc_impl.orc \ volk_32f_divide_aligned16_orc_impl.orc \ volk_32f_multiply_aligned16_orc_impl.orc \ -volk_32f_sqrt_aligned16_orc_impl.orc +volk_32f_sqrt_aligned16_orc_impl.orc \ +volk_16sc_magnitude_32f_aligned16_orc_impl.orc \ +volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ +volk_32fc_magnitude_16s_aligned16_orc_impl.orc diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc new file mode 100644 index 000000000..f6c959c00 --- /dev/null +++ b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc @@ -0,0 +1,24 @@ +.function volk_16sc_magnitude_16s_aligned16_orc_impl +.source 4 src +.dest 2 dst +.temp 2 reals +.temp 2 imags +.temp 4 reall +.temp 4 imagl +.temp 4 realf +.temp 4 imagf +.temp 4 sumf +.temp 4 rootf +.temp 4 rootl + +splitlw reals, imags, src +convwl reall, reals +convwl imagl, imags +convlf realf, reall +convlf imagf, imagl +mulf realf, realf, (1.0 / 32768.0) +mulf imagf, imagf, (1.0 / 32768.0) +addf sumf, realf, imagf +sqrtf rootf, sumf +convfl rootl, rootf +conflw dst, rootl diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc new file mode 100644 index 000000000..44654ad8e --- /dev/null +++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc @@ -0,0 +1,25 @@ +.function volk_16sc_magnitude_32f_aligned16_orc_impl +.source 4 src +.dest 4 dst +.floatparam 4 scalar +.temp 4 invscalar +.temp 4 reall +.temp 4 imagl +.temp 2 reals +.temp 2 imags +.temp 4 realf +.temp 4 imagf +.temp 4 sumf + +divf invscalar, 1.0, scalar +splitlw reals, imags, src +convswl reall, reals +convswl imagl, imags +convlf realf, reall +convlf imagf, imagl +mulf realf, realf, invscalar +mulf imagf, imagf, invscalar +mulf realf, realf, realf +mulf imagf, imagf, imagf +addf sumf, realf, imagf +sqrtf dst, sumf diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc new file mode 100644 index 000000000..db8405e59 --- /dev/null +++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc @@ -0,0 +1,25 @@ +.function volk_32fc_magnitude_16s_aligned16_orc_impl +.source 8 src +.dest 2 dst +.floatparam 4 scalar +.temp 4 invscalar +.temp 4 reall +.temp 4 imagl +.temp 4 realf +.temp 4 imagf +.temp 4 sumf +.temp 4 rootf +.temp 4 rootl + +divf invscalar, 1.0, scalar +splitql reall, imagl, src +convlf realf, reall +convlf imagf, imagl +mulf realf, realf, invscalar +mulf imagf, imagf, invscalar +mulf realf, realf, realf +mulf imagf, imagf, imagf +addf sumf, realf, imagf +sqrtf rootf, sumf +convfl rootl, rootf +convlw dst, rootl diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc new file mode 100644 index 000000000..cc5c85b45 --- /dev/null +++ b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc @@ -0,0 +1,21 @@ +.function volk_32fc_magnitude_32f_aligned16_orc_impl +.source 8 src +.dest 4 dst +.floatparam 4 scalar +.temp 4 invscalar +.temp 4 reall +.temp 4 imagl +.temp 4 realf +.temp 4 imagf +.temp 4 sumf + +divf invscalar, 1.0, scalar +splitql reall, imagl, src +convlf realf, reall +convlf imagf, imagl +mulf realf, realf, invscalar +mulf imagf, imagf, invscalar +mulf realf, realf, realf +mulf imagf, imagf, imagf +addf sumf, realf, imagf +sqrtf dst, sumf diff --git a/volk/orc/volk_32s_or_aligned16_orc_impl.orc b/volk/orc/volk_32s_or_aligned16_orc_impl.orc new file mode 100644 index 000000000..6d2a3859a --- /dev/null +++ b/volk/orc/volk_32s_or_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32s_or_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +orl dst, src1, src2 -- cgit From 320c16f5acdcbca6f59c735ad19619d7b82c434b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 16 Dec 2010 21:27:38 -0500 Subject: Changed python env variable to more globally usable version. --- volk/include/volk/volk_register.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index d00501a93..9fded9a3e 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -1,4 +1,4 @@ -#! /usr/bin/python +#! /usr/bin/env python import sys import re -- cgit From ce3e4c33d170b65cf288faec7d8da6a496eb6101 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 16 Dec 2010 21:33:54 -0500 Subject: Including time header to qa files. --- volk/lib/qa_16s_add_quad_aligned16.cc | 2 +- volk/lib/qa_16s_branch_4_state_8_aligned16.cc | 2 +- volk/lib/qa_16s_convert_32f_aligned16.cc | 1 + volk/lib/qa_16s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_16s_convert_8s_aligned16.cc | 1 + volk/lib/qa_16s_convert_8s_unaligned16.cc | 1 + volk/lib/qa_16s_max_star_aligned16.cc | 2 +- volk/lib/qa_16s_max_star_horizontal_aligned16.cc | 2 +- volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc | 2 +- volk/lib/qa_16s_quad_max_star_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 1 + volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 1 + volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 1 + volk/lib/qa_16u_byteswap_aligned16.cc | 1 + volk/lib/qa_32f_accumulator_aligned16.cc | 1 + volk/lib/qa_32f_add_aligned16.cc | 1 + volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc | 1 + volk/lib/qa_32f_convert_16s_aligned16.cc | 1 + volk/lib/qa_32f_convert_16s_unaligned16.cc | 1 + volk/lib/qa_32f_convert_32s_aligned16.cc | 1 + volk/lib/qa_32f_convert_32s_unaligned16.cc | 1 + volk/lib/qa_32f_convert_64f_aligned16.cc | 1 + volk/lib/qa_32f_convert_64f_unaligned16.cc | 1 + volk/lib/qa_32f_convert_8s_aligned16.cc | 1 + volk/lib/qa_32f_convert_8s_unaligned16.cc | 1 + volk/lib/qa_32f_divide_aligned16.cc | 1 + volk/lib/qa_32f_fm_detect_aligned16.cc | 1 + volk/lib/qa_32f_interleave_16sc_aligned16.cc | 1 + volk/lib/qa_32f_interleave_32fc_aligned16.cc | 1 + volk/lib/qa_32f_max_aligned16.cc | 1 + volk/lib/qa_32f_min_aligned16.cc | 1 + volk/lib/qa_32f_multiply_aligned16.cc | 1 + volk/lib/qa_32f_normalize_aligned16.cc | 1 + volk/lib/qa_32f_sqrt_aligned16.cc | 1 + volk/lib/qa_32f_stddev_aligned16.cc | 1 + volk/lib/qa_32f_stddev_and_mean_aligned16.cc | 1 + volk/lib/qa_32f_subtract_aligned16.cc | 1 + volk/lib/qa_32fc_atan2_32f_aligned16.cc | 1 + volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_64f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc | 1 + volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 1 + volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 1 + volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc | 1 + volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc | 1 + volk/lib/qa_32s_and_aligned16.cc | 1 + volk/lib/qa_32s_convert_32f_aligned16.cc | 1 + volk/lib/qa_32s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_32s_or_aligned16.cc | 1 + volk/lib/qa_32u_byteswap_aligned16.cc | 1 + volk/lib/qa_32u_popcnt_aligned16.cc | 1 + volk/lib/qa_64f_convert_32f_aligned16.cc | 1 + volk/lib/qa_64f_convert_32f_unaligned16.cc | 1 + volk/lib/qa_64f_max_aligned16.cc | 1 + volk/lib/qa_64f_min_aligned16.cc | 1 + volk/lib/qa_64u_byteswap_aligned16.cc | 1 + volk/lib/qa_64u_popcnt_aligned16.cc | 1 + volk/lib/qa_8s_convert_16s_aligned16.cc | 1 + volk/lib/qa_8s_convert_16s_unaligned16.cc | 1 + volk/lib/qa_8s_convert_32f_aligned16.cc | 1 + volk/lib/qa_8s_convert_32f_unaligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_32f_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc | 1 + volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc | 1 + volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc | 2 +- volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc | 2 +- 75 files changed, 75 insertions(+), 7 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc index c3005c1be..154aa0f17 100644 --- a/volk/lib/qa_16s_add_quad_aligned16.cc +++ b/volk/lib/qa_16s_add_quad_aligned16.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include //test for sse2 #ifndef LV_HAVE_SSE2 diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc index ba5e8ed93..62deffaeb 100644 --- a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc +++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -1,7 +1,7 @@ #include #include #include -#include +#include //test for ssse3 diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc index 7878d4737..6215f4a64 100644 --- a/volk/lib/qa_16s_convert_32f_aligned16.cc +++ b/volk/lib/qa_16s_convert_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc index 8c3121e5c..46c2e48ac 100644 --- a/volk/lib/qa_16s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc index 734b7784e..8225aa0cf 100644 --- a/volk/lib/qa_16s_convert_8s_aligned16.cc +++ b/volk/lib/qa_16s_convert_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc index 275ab7668..e6ce5030e 100644 --- a/volk/lib/qa_16s_convert_8s_unaligned16.cc +++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc index b46b9ae8e..c6f828ba6 100644 --- a/volk/lib/qa_16s_max_star_aligned16.cc +++ b/volk/lib/qa_16s_max_star_aligned16.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include //test for ssse3 #ifndef LV_HAVE_SSSE3 diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc index 4d44735df..0a58570e2 100644 --- a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc +++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include //test for ssse3 #ifndef LV_HAVE_SSSE3 diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc index 3c4f5c6cc..819b2256b 100644 --- a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc +++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include //test for sse2 diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc index 80a220c93..66f8c9afa 100644 --- a/volk/lib/qa_16s_quad_max_star_aligned16.cc +++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc index e700ac72c..c775e8596 100644 --- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc index 6ee076998..b25094e89 100644 --- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc index ca048ea67..c67064ea6 100644 --- a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc index 0f4ba6923..f86f03b88 100644 --- a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc index 5ab458bc9..dd446567e 100644 --- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc index b14610757..9799ef43b 100644 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index 2c9e48f6e..1ebe644c5 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc index 6b19828a4..ea117a820 100644 --- a/volk/lib/qa_16u_byteswap_aligned16.cc +++ b/volk/lib/qa_16u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc index ea637d600..0defef283 100644 --- a/volk/lib/qa_32f_accumulator_aligned16.cc +++ b/volk/lib/qa_32f_accumulator_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc index 002aebfc9..f80d562d4 100644 --- a/volk/lib/qa_32f_add_aligned16.cc +++ b/volk/lib/qa_32f_add_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc index 3c8137004..5d6987333 100644 --- a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc +++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc index 84a4c40c4..3e2452e68 100644 --- a/volk/lib/qa_32f_convert_16s_aligned16.cc +++ b/volk/lib/qa_32f_convert_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc index 9469daed2..e016b7ff7 100644 --- a/volk/lib/qa_32f_convert_16s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc index ff24c7b0d..abceb52fb 100644 --- a/volk/lib/qa_32f_convert_32s_aligned16.cc +++ b/volk/lib/qa_32f_convert_32s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc index e63b17994..90f84b56f 100644 --- a/volk/lib/qa_32f_convert_32s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc index c546e47de..1d0754ac9 100644 --- a/volk/lib/qa_32f_convert_64f_aligned16.cc +++ b/volk/lib/qa_32f_convert_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc index 24b51f9af..6f7d5066d 100644 --- a/volk/lib/qa_32f_convert_64f_unaligned16.cc +++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc index a3d4d6567..6a53629b5 100644 --- a/volk/lib/qa_32f_convert_8s_aligned16.cc +++ b/volk/lib/qa_32f_convert_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc index d885fd6bb..fbc5c20e6 100644 --- a/volk/lib/qa_32f_convert_8s_unaligned16.cc +++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc index 8826bf94f..3257a3751 100644 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ b/volk/lib/qa_32f_divide_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc index ca65add28..592304f83 100644 --- a/volk/lib/qa_32f_fm_detect_aligned16.cc +++ b/volk/lib/qa_32f_fm_detect_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc index 2a937637f..a7ae60780 100644 --- a/volk/lib/qa_32f_interleave_16sc_aligned16.cc +++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc index c22dd1046..333b6fce8 100644 --- a/volk/lib/qa_32f_interleave_32fc_aligned16.cc +++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc index 3ef375176..ceb913cb4 100644 --- a/volk/lib/qa_32f_max_aligned16.cc +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc index 617e18b24..580a60e7d 100644 --- a/volk/lib/qa_32f_min_aligned16.cc +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc index e52748466..0c242b649 100644 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32f_multiply_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc index 2954fc3ae..1c7b485a6 100644 --- a/volk/lib/qa_32f_normalize_aligned16.cc +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc index 9a5f71de0..62d55767a 100644 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ b/volk/lib/qa_32f_sqrt_aligned16.cc @@ -21,6 +21,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc index c0f22cdea..5934d70df 100644 --- a/volk/lib/qa_32f_stddev_aligned16.cc +++ b/volk/lib/qa_32f_stddev_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc index dcad8bcf3..78c701d78 100644 --- a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc +++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc index a7e1b5ae3..ffe4b504c 100644 --- a/volk/lib/qa_32f_subtract_aligned16.cc +++ b/volk/lib/qa_32f_subtract_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc index a24382d71..c55ab5aa0 100644 --- a/volk/lib/qa_32fc_atan2_32f_aligned16.cc +++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc index 497914e0a..2f9a30395 100644 --- a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc +++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include #define assertcomplexEqual(expected, actual, delta) \ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc index 0f5a030f5..72e084c05 100644 --- a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc index 6e051afbc..89770c236 100644 --- a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc index 850518524..7472476f7 100644 --- a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc index 321deb184..5cbdc49b3 100644 --- a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc index aedb2e387..4147e30ae 100644 --- a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc +++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index a4be1616b..16984e30d 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc index d69ada408..b99f1ddcf 100644 --- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc index 83cdf4b15..a3d0955bd 100644 --- a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse3 diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc index 4d1359068..1444c78a9 100644 --- a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc +++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse3 diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc index 72d05cf6f..661801709 100644 --- a/volk/lib/qa_32s_and_aligned16.cc +++ b/volk/lib/qa_32s_and_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc index eab3fe016..07d799809 100644 --- a/volk/lib/qa_32s_convert_32f_aligned16.cc +++ b/volk/lib/qa_32s_convert_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc index 0e504cfa1..2ec610ffb 100644 --- a/volk/lib/qa_32s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc index e09dfb91c..9da2ae344 100644 --- a/volk/lib/qa_32s_or_aligned16.cc +++ b/volk/lib/qa_32s_or_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc index 8b1023876..313c786b6 100644 --- a/volk/lib/qa_32u_byteswap_aligned16.cc +++ b/volk/lib/qa_32u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc index 49fcddeb2..618a82a02 100644 --- a/volk/lib/qa_32u_popcnt_aligned16.cc +++ b/volk/lib/qa_32u_popcnt_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc index 0eaebf00a..7f9c4584a 100644 --- a/volk/lib/qa_64f_convert_32f_aligned16.cc +++ b/volk/lib/qa_64f_convert_32f_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc index dcf94bd27..98aadbf4d 100644 --- a/volk/lib/qa_64f_convert_32f_unaligned16.cc +++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse2 diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc index 41ab078b0..76e755514 100644 --- a/volk/lib/qa_64f_max_aligned16.cc +++ b/volk/lib/qa_64f_max_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc index b4664d065..4b70d2881 100644 --- a/volk/lib/qa_64f_min_aligned16.cc +++ b/volk/lib/qa_64f_min_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc index 4f5d4d02b..20d012c9e 100644 --- a/volk/lib/qa_64u_byteswap_aligned16.cc +++ b/volk/lib/qa_64u_byteswap_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc index bce9ff6c2..85ef58795 100644 --- a/volk/lib/qa_64u_popcnt_aligned16.cc +++ b/volk/lib/qa_64u_popcnt_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc index 35f08fb81..8dd5f76ca 100644 --- a/volk/lib/qa_8s_convert_16s_aligned16.cc +++ b/volk/lib/qa_8s_convert_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4_1 diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc index bb326f818..12c502d4b 100644 --- a/volk/lib/qa_8s_convert_16s_unaligned16.cc +++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4_1 diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc index 522da0b9d..672f5662f 100644 --- a/volk/lib/qa_8s_convert_32f_aligned16.cc +++ b/volk/lib/qa_8s_convert_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4.1 diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc index ea1fb7c74..43468b1b1 100644 --- a/volk/lib/qa_8s_convert_32f_unaligned16.cc +++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse4.1 diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc index 823e7fe2e..94e63e37d 100644 --- a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc index fb580516c..29073eed7 100644 --- a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc index 1cc844b52..4980c982a 100644 --- a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc index 10e537cde..3c3f737a1 100644 --- a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc @@ -3,6 +3,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc index d84df8119..a33d1bf30 100644 --- a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc @@ -2,6 +2,7 @@ #include #include #include +#include //test for sse diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc index d64eac8ce..216bf1cef 100644 --- a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc +++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc index c27f0e0ca..4c707446e 100644 --- a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc +++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc @@ -3,7 +3,7 @@ #include #include #include -#include +#include #define assertcomplexEqual(expected, actual, delta) \ CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ -- cgit From a89f5dd45263209c535c272b56abb99133a4d6c7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 16 Dec 2010 21:49:45 -0500 Subject: Using a copy of config.guess and config.sub instead of sym links. --- volk/config.guess | 1506 +++++++++++++++++++++++++++++++++++++++++++++- volk/config.sub | 1740 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 3244 insertions(+), 2 deletions(-) mode change 120000 => 100755 volk/config.guess mode change 120000 => 100755 volk/config.sub (limited to 'volk') diff --git a/volk/config.guess b/volk/config.guess deleted file mode 120000 index 405bc3235..000000000 --- a/volk/config.guess +++ /dev/null @@ -1 +0,0 @@ -/usr/share/automake-1.11/config.guess \ No newline at end of file diff --git a/volk/config.guess b/volk/config.guess new file mode 100755 index 000000000..285237846 --- /dev/null +++ b/volk/config.guess @@ -0,0 +1,1505 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 +# Free Software Foundation, Inc. + +timestamp='2010-08-21' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner. Please send patches (context +# diff format) to and include a ChangeLog +# entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free +Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' HUP INT TERM + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" HUP INT PIPE TERM ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ELF__ + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + echo "${machine}-${os}${release}" + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm:riscos:*:*|arm:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + s390x:SunOS:*:*) + echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + echo i386-pc-auroraux${UNAME_RELEASE} + exit ;; + i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) + eval $set_cc_for_build + SUN_ARCH="i386" + # If there is a compiler, see if it is configured for 64-bit objects. + # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. + # This test works for both compilers. + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + SUN_ARCH="x86_64" + fi + fi + echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. + echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[4567]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + grep -q __LP64__ + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + case ${UNAME_MACHINE} in + pc98) + echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + amd64) + echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + *) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + esac + exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + *:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + *:Interix*:*) + case ${UNAME_MACHINE} in + x86) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + authenticamd | genuineintel | EM64T) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + IA64) + echo ia64-unknown-interix${UNAME_RELEASE} + exit ;; + esac ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + 8664:Windows_NT:*) + echo x86_64-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? + echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; + arm*:Linux:*:*) + eval $set_cc_for_build + if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_EABI__ + then + echo ${UNAME_MACHINE}-unknown-linux-gnu + else + echo ${UNAME_MACHINE}-unknown-linux-gnueabi + fi + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit ;; + crisv32:Linux:*:*) + echo crisv32-axis-linux-gnu + exit ;; + frv:Linux:*:*) + echo frv-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + LIBC=gnu + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + mips:Linux:*:* | mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef ${UNAME_MACHINE} + #undef ${UNAME_MACHINE}el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=${UNAME_MACHINE}el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=${UNAME_MACHINE} + #else + CPU= + #endif + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + or32:Linux:*:*) + echo or32-unknown-linux-gnu + exit ;; + padre:Linux:*:*) + echo sparc-unknown-linux-gnu + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + tile*:Linux:*:*) + echo ${UNAME_MACHINE}-tilera-linux-gnu + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-gnu + exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit ;; + xtensa*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. + echo ${UNAME_MACHINE}-pc-os2-emx + exit ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i586. + # Note: whatever this is, it MUST be the same as what config.sub + # prints for the "djgpp" host, or else GDB configury will decide that + # this is a cross-build. + echo i586-pc-msdosdjgpp + exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + NCR*:*:4.2:* | MPRAS*:*:4.2:*) + OS_REL='.3' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + echo i586-pc-beos + exit ;; + BePC:Haiku:*:*) # Haiku running on Intel PC compatible. + echo i586-pc-haiku + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + SX-8R:SUPER-UX:*:*) + echo sx8r-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + case $UNAME_PROCESSOR in + i386) + eval $set_cc_for_build + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + UNAME_PROCESSOR="x86_64" + fi + fi ;; + unknown) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NSE-?:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. + if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit ;; + *:DragonFly:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case "${UNAME_MACHINE}" in + A*) echo alpha-dec-vms ; exit ;; + I*) echo ia64-dec-vms ; exit ;; + V*) echo vax-dec-vms ; exit ;; + esac ;; + *:XENIX:*:SysV) + echo i386-pc-xenix + exit ;; + i*86:skyos:*:*) + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + exit ;; + i*86:rdos:*:*) + echo ${UNAME_MACHINE}-pc-rdos + exit ;; + i*86:AROS:*:*) + echo ${UNAME_MACHINE}-pc-aros + exit ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... */ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + +cat >&2 < in order to provide the needed +information to handle your system. + +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/volk/config.sub b/volk/config.sub deleted file mode 120000 index 4d47fbcbc..000000000 --- a/volk/config.sub +++ /dev/null @@ -1 +0,0 @@ -/usr/share/automake-1.11/config.sub \ No newline at end of file diff --git a/volk/config.sub b/volk/config.sub new file mode 100755 index 000000000..320e30388 --- /dev/null +++ b/volk/config.sub @@ -0,0 +1,1739 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 +# Free Software Foundation, Inc. + +timestamp='2010-09-11' + +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. +# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Please send patches to . Submit a context +# diff and a properly formatted GNU ChangeLog entry. +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS + $0 [OPTION] ALIAS + +Canonicalize a configuration name. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.sub ($timestamp) + +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free +Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo $1 + exit ;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ + linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ + knetbsd*-gnu* | netbsd*-gnu* | \ + kopensolaris*-gnu* | \ + storm-chaos* | os2-emx* | rtmk-nova*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. + ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis | -knuth | -cray | -microblaze) + os= + basic_machine=$1 + ;; + -bluegene*) + os=-cnk + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -chorusos*) + os=-chorusos + basic_machine=$1 + ;; + -chorusrdb) + os=-chorusrdb + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco6) + os=-sco5v6 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5v6*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. + 1750a | 580 \ + | a29k \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ + | am33_2.0 \ + | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ + | bfin \ + | c4x | clipper \ + | d10v | d30v | dlx | dsp16xx \ + | fido | fr30 | frv \ + | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | i370 | i860 | i960 | ia64 \ + | ip2k | iq2000 \ + | lm32 \ + | m32c | m32r | m32rle | m68000 | m68k | m88k \ + | maxq | mb | microblaze | mcore | mep | metag \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64el \ + | mips64octeon | mips64octeonel \ + | mips64orion | mips64orionel \ + | mips64r5900 | mips64r5900el \ + | mips64vr | mips64vrel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipstx39 | mipstx39el \ + | mn10200 | mn10300 \ + | moxie \ + | mt \ + | msp430 \ + | nds32 | nds32le | nds32be \ + | nios | nios2 \ + | ns16k | ns32k \ + | or32 \ + | pdp10 | pdp11 | pj | pjl \ + | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ + | pyramid \ + | rx \ + | score \ + | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh64 | sh64le \ + | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ + | spu | strongarm \ + | tahoe | thumb | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ + | ubicom32 \ + | v850 | v850e \ + | we32k \ + | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ + | z8k | z80) + basic_machine=$basic_machine-unknown + ;; + c54x) + basic_machine=tic54x-unknown + ;; + c55x) + basic_machine=tic55x-unknown + ;; + c6x) + basic_machine=tic6x-unknown + ;; + m6811 | m68hc11 | m6812 | m68hc12 | picochip) + # Motorola 68HC11/12. + basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + ;; + ms1) + basic_machine=mt-unknown + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. + 580-* \ + | a29k-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ + | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ + | avr-* | avr32-* \ + | bfin-* | bs2000-* \ + | c[123]* | c30-* | [cjt]90-* | c4x-* \ + | clipper-* | craynv-* | cydra-* \ + | d10v-* | d30v-* | dlx-* \ + | elxsi-* \ + | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ + | h8300-* | h8500-* \ + | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ + | i*86-* | i860-* | i960-* | ia64-* \ + | ip2k-* | iq2000-* \ + | lm32-* \ + | m32c-* | m32r-* | m32rle-* \ + | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ + | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \ + | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ + | mips16-* \ + | mips64-* | mips64el-* \ + | mips64octeon-* | mips64octeonel-* \ + | mips64orion-* | mips64orionel-* \ + | mips64r5900-* | mips64r5900el-* \ + | mips64vr-* | mips64vrel-* \ + | mips64vr4100-* | mips64vr4100el-* \ + | mips64vr4300-* | mips64vr4300el-* \ + | mips64vr5000-* | mips64vr5000el-* \ + | mips64vr5900-* | mips64vr5900el-* \ + | mipsisa32-* | mipsisa32el-* \ + | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa64-* | mipsisa64el-* \ + | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64sb1-* | mipsisa64sb1el-* \ + | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipstx39-* | mipstx39el-* \ + | mmix-* \ + | mt-* \ + | msp430-* \ + | nds32-* | nds32le-* | nds32be-* \ + | nios-* | nios2-* \ + | none-* | np1-* | ns16k-* | ns32k-* \ + | orion-* \ + | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ + | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ + | pyramid-* \ + | romp-* | rs6000-* | rx-* \ + | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ + | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ + | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ + | sparclite-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ + | tahoe-* | thumb-* \ + | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ + | tile-* | tilegx-* \ + | tron-* \ + | ubicom32-* \ + | v850-* | v850e-* | vax-* \ + | we32k-* \ + | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ + | xstormy16-* | xtensa*-* \ + | ymp-* \ + | z8k-* | z80-*) + ;; + # Recognize the basic CPU types without company name, with glob match. + xtensa*) + basic_machine=$basic_machine-unknown + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. + 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + abacus) + basic_machine=abacus-unknown + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amd64) + basic_machine=x86_64-pc + ;; + amd64-*) + basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aros) + basic_machine=i386-pc + os=-aros + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + blackfin) + basic_machine=bfin-unknown + os=-linux + ;; + blackfin-*) + basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + bluegene*) + basic_machine=powerpc-ibm + os=-cnk + ;; + c54x-*) + basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c55x-*) + basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c6x-*) + basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + c90) + basic_machine=c90-cray + os=-unicos + ;; + cegcc) + basic_machine=arm-unknown + os=-cegcc + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | j90) + basic_machine=j90-cray + os=-unicos + ;; + craynv) + basic_machine=craynv-cray + os=-unicosmp + ;; + cr16) + basic_machine=cr16-unknown + os=-elf + ;; + crds | unos) + basic_machine=m68k-crds + ;; + crisv32 | crisv32-* | etraxfs*) + basic_machine=crisv32-axis + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + crx) + basic_machine=crx-unknown + os=-elf + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + decsystem10* | dec10*) + basic_machine=pdp10-dec + os=-tops10 + ;; + decsystem20* | dec20*) + basic_machine=pdp10-dec + os=-tops20 + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + dicos) + basic_machine=i686-pc + os=-dicos + ;; + djgpp) + basic_machine=i586-pc + os=-msdosdjgpp + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) + basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? + i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m68knommu) + basic_machine=m68k-unknown + os=-linux + ;; + m68knommu-*) + basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + microblaze) + basic_machine=microblaze-xilinx + ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + os=-mingw32ce + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + morphos) + basic_machine=powerpc-unknown + os=-morphos + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + ms1-*) + basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + neo-tandem) + basic_machine=neo-tandem + ;; + nse-tandem) + basic_machine=nse-tandem + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + openrisc | openrisc-*) + basic_machine=or32-unknown + ;; + os400) + basic_machine=powerpc-ibm + os=-os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + parisc) + basic_machine=hppa-unknown + os=-linux + ;; + parisc-*) + basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` + os=-linux + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pc98) + basic_machine=i386-pc + ;; + pc98-*) + basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium | p5 | k5 | k6 | nexgen | viac3) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon | athlon_*) + basic_machine=i686-pc + ;; + pentiumii | pentium2 | pentiumiii | pentium3) + basic_machine=i686-pc + ;; + pentium4) + basic_machine=i786-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium4-*) + basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64) basic_machine=powerpc64-unknown + ;; + ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64le | powerpc64little | ppc64-le | powerpc64-little) + basic_machine=powerpc64le-unknown + ;; + ppc64le-* | powerpc64little-*) + basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rdos) + basic_machine=i386-pc + os=-rdos + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + s390 | s390-*) + basic_machine=s390-ibm + ;; + s390x | s390x-*) + basic_machine=s390x-ibm + ;; + sa29200) + basic_machine=a29k-amd + os=-udi + ;; + sb1) + basic_machine=mipsisa64sb1-unknown + ;; + sb1el) + basic_machine=mipsisa64sb1el-unknown + ;; + sde) + basic_machine=mipsisa32-sde + os=-elf + ;; + sei) + basic_machine=mips-sei + os=-seiux + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sh5el) + basic_machine=sh5le-unknown + ;; + sh64) + basic_machine=sh64-unknown + ;; + sparclite-wrs | simso-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=alphaev5-cray + os=-unicos + ;; + t90) + basic_machine=t90-cray + os=-unicos + ;; + # This must be matched before tile*. + tilegx*) + basic_machine=tilegx-unknown + os=-linux-gnu + ;; + tile*) + basic_machine=tile-unknown + os=-linux-gnu + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + toad1) + basic_machine=pdp10-xkl + os=-tops20 + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + tpf) + basic_machine=s390x-ibm + os=-tpf + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xbox) + basic_machine=i686-pc + os=-mingw32 + ;; + xps | xps100) + basic_machine=xps100-honeywell + ;; + ymp) + basic_machine=ymp-cray + os=-unicos + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + z80-*-coff) + basic_machine=z80-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. + w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + romp) + basic_machine=romp-ibm + ;; + mmix) + basic_machine=mmix-knuth + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) + basic_machine=sh-unknown + ;; + sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. + -auroraux) + os=-auroraux + ;; + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ + | -sym* | -kopensolaris* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* | -aros* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ + | -openbsd* | -solidbsd* \ + | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ + | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -chorusos* | -chorusrdb* | -cegcc* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -linux-android* \ + | -linux-newlib* | -linux-uclibc* \ + | -uxpv* | -beos* | -mpeix* | -udk* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ + | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ + | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ + | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) + # Remember, each alternative MUST END IN *, to match a version number. + ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto-qnx*) + ;; + -nto*) + os=`echo $os | sed -e 's|nto|nto-qnx|'` + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux-dietlibc) + os=-linux-dietlibc + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -os400*) + os=-os400 + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -atheos*) + os=-atheos + ;; + -syllable*) + os=-syllable + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -nova*) + os=-rtmk-nova + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -tpf*) + os=-tpf + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -aros*) + os=-aros + ;; + -kaos*) + os=-kaos + ;; + -zvmoe) + os=-zvmoe + ;; + -dicos*) + os=-dicos + ;; + -nacl*) + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + score-*) + os=-elf + ;; + spu-*) + os=-elf + ;; + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + c4x-* | tic4x-*) + os=-coff + ;; + tic54x-*) + os=-coff + ;; + tic55x-*) + os=-coff + ;; + tic6x-*) + os=-coff + ;; + # This must come before the *-dec entry. + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + m68*-cisco) + os=-aout + ;; + mep-*) + os=-elf + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + or32-*) + os=-coff + ;; + *-tti) # must be before sparc entry or we get the wrong os. + os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-haiku) + os=-haiku + ;; + *-ibm) + os=-aix + ;; + *-knuth) + os=-mmixware + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. +vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -cnk*|-aix*) + vendor=ibm + ;; + -beos*) + vendor=be + ;; + -hpux*) + vendor=hp + ;; + -mpeix*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs* | -opened*) + vendor=ibm + ;; + -os400*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -tpf*) + vendor=ibm + ;; + -vxsim* | -vxworks* | -windiss*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + -hms*) + vendor=hitachi + ;; + -mpw* | -macos*) + vendor=apple + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + vendor=atari + ;; + -vos*) + vendor=stratus + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os +exit + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: -- cgit From c6fff77de9b686761f93f0e1de237f8543f5e919 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 17 Dec 2010 11:14:41 -0800 Subject: Volk: A bunch of new Orc routines plus a couple of build changes. 32fc_magnitude_16s fails test_all right now. --- volk/config/orc.m4 | 2 +- .../volk/volk_16sc_deinterleave_16s_aligned16.h | 14 ++++++++++++- .../volk/volk_16sc_deinterleave_32f_aligned16.h | 15 +++++++++++++- .../volk_16sc_deinterleave_real_8s_aligned16.h | 13 +++++++++++- .../volk/volk_16sc_magnitude_16s_aligned16.h | 6 +++--- .../volk/volk_16sc_magnitude_32f_aligned16.h | 2 +- volk/include/volk/volk_32f_max_aligned16.h | 14 +++++++++++++ volk/include/volk/volk_32f_min_aligned16.h | 14 +++++++++++++ volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 12 +++++++++++ volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 11 ++++++++++ volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 9 ++++++++ volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 5 +++-- volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 6 +++--- volk/lib/qa_32f_max_aligned16.cc | 9 ++++++++ volk/lib/qa_32f_min_aligned16.cc | 9 ++++++++ volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 8 ++++---- volk/lib/qa_volk.cc | 1 - volk/orc/Makefile.am | 10 ++++++--- ...lk_16sc_deinterleave_16s_aligned16_orc_impl.orc | 5 +++++ ...lk_16sc_deinterleave_32f_aligned16_orc_impl.orc | 12 +++++++++++ ...6sc_deinterleave_real_8s_aligned16_orc_impl.orc | 6 ++++++ .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc | 14 ++++++++----- .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc | 8 ++++---- volk/orc/volk_32f_max_aligned16_orc_impl.orc | 5 +++++ volk/orc/volk_32f_min_aligned16_orc_impl.orc | 5 +++++ .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 24 ++++++++-------------- .../volk_32fc_magnitude_32f_aligned16_orc_impl.orc | 22 +++++++------------- volk/volk.pc.in | 2 +- 28 files changed, 202 insertions(+), 61 deletions(-) create mode 100644 volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_max_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_min_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index 9645661b0..a4653400c 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -5,7 +5,7 @@ dnl ORC_CHECK([REQUIRED_VERSION]) AC_DEFUN([ORC_CHECK], [ - ORC_REQ=ifelse([$1], , "0.4.6", [$1]) + ORC_REQ=ifelse([$1], , "0.4.10", [$1]) enable_orc = auto if test "x$enable_orc" != "xno" ; then diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h index 32e13df98..cf94a3f38 100644 --- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h @@ -140,7 +140,19 @@ static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h index 86f67437d..50b8b62d5 100644 --- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h @@ -89,7 +89,20 @@ static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h index c0d1e941a..2dd85a422 100644 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h @@ -77,7 +77,18 @@ static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuf } #endif /* LV_HAVE_GENERIC */ - +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ #endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h index 9f3222aa6..41e8751d6 100644 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h @@ -173,16 +173,16 @@ static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeV } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#if LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points); +extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, num_points); + volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h index e063ae432..c2605d551 100644 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h @@ -161,7 +161,7 @@ static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVec } #endif /* LV_HAVE_GENERIC */ -#if LV_HAVE_ORC +#if LV_HAVE_ORC_DISABLED /*! \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector \param complexVector The vector containing the complex input values diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h index 96aafb2bf..d4e30fba8 100644 --- a/volk/include/volk/volk_32f_max_aligned16.h +++ b/volk/include/volk/volk_32f_max_aligned16.h @@ -67,5 +67,19 @@ static inline void volk_32f_max_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h index e247f4213..55daafb6a 100644 --- a/volk/include/volk/volk_32f_min_aligned16.h +++ b/volk/include/volk/volk_32f_min_aligned16.h @@ -67,5 +67,19 @@ static inline void volk_32f_min_aligned16_generic(float* cVector, const float* a } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + #endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc index e700ac72c..7e9e31df5 100644 --- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc @@ -26,6 +26,8 @@ void qa_16sc_deinterleave_16s_aligned16::t1() { int16_t output_generic1[vlen] __attribute__ ((aligned (16))); int16_t output_sse2[vlen] __attribute__ ((aligned (16))); int16_t output_sse21[vlen] __attribute__ ((aligned (16))); + int16_t output_orc[vlen] __attribute__ ((aligned (16))); + int16_t output_orc1[vlen] __attribute__ ((aligned (16))); int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); int16_t output_ssse31[vlen] __attribute__ ((aligned (16))); @@ -43,6 +45,13 @@ void qa_16sc_deinterleave_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_16s_aligned16_manual(output_orc, output_orc1, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); } @@ -70,6 +79,9 @@ void qa_16sc_deinterleave_16s_aligned16::t1() { CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]); + + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]); + CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_orc1[i]); } } diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc index 6ee076998..45100206d 100644 --- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc @@ -26,6 +26,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() { float output_generic1[vlen] __attribute__ ((aligned (16))); float output_sse2[vlen] __attribute__ ((aligned (16))); float output_sse21[vlen] __attribute__ ((aligned (16))); + float output_orc[vlen] __attribute__ ((aligned (16))); + float output_orc1[vlen] __attribute__ ((aligned (16))); int16_t* loadInput = (int16_t*)input0; for(int i = 0; i < vlen*2; ++i) { @@ -41,6 +43,13 @@ void qa_16sc_deinterleave_32f_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_32f_aligned16_manual(output_orc, output_orc1, input0, 32768.0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse"); } @@ -57,6 +66,8 @@ void qa_16sc_deinterleave_32f_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_orc1[i], fabs(output_generic1[i])*1e-4); } } diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc index 5ab458bc9..d187d20c3 100644 --- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc +++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc @@ -24,6 +24,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() { int8_t output_generic[vlen] __attribute__ ((aligned (16))); int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); + int8_t output_orc[vlen] __attribute__ ((aligned (16))); int16_t* loadInput = (int16_t*)input0; for(int i = 0; i < vlen*2; ++i) { @@ -39,6 +40,13 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_16sc_deinterleave_real_8s_aligned16_manual(output_orc, input0, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); } @@ -54,6 +62,7 @@ void qa_16sc_deinterleave_real_8s_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); + CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]); } } diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc index c8f13ff84..dd4ae75ff 100644 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -40,13 +40,14 @@ void qa_16sc_magnitude_16s_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); - start = clock(); +/* start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc"); } end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("orc_time: %f\n", total); +*/ start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); @@ -72,7 +73,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); + //CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); } } diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc index e7178863c..53d42e28c 100644 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc @@ -90,14 +90,14 @@ void qa_16sc_magnitude_32f_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); - start = clock(); +/* start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc"); } end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("orc_time: %f\n", total); - +*/ start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); @@ -123,7 +123,7 @@ void qa_16sc_magnitude_32f_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); +// CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); } } diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc index 3ef375176..cb1fd3627 100644 --- a/volk/lib/qa_32f_max_aligned16.cc +++ b/volk/lib/qa_32f_max_aligned16.cc @@ -25,6 +25,7 @@ void qa_32f_max_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -40,6 +41,13 @@ void qa_32f_max_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_max_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -54,6 +62,7 @@ void qa_32f_max_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc index 617e18b24..bf453f360 100644 --- a/volk/lib/qa_32f_min_aligned16.cc +++ b/volk/lib/qa_32f_min_aligned16.cc @@ -25,6 +25,7 @@ void qa_32f_min_aligned16::t1() { float output0[vlen] __attribute__ ((aligned (16))); float output01[vlen] __attribute__ ((aligned (16))); + float output02[vlen] __attribute__ ((aligned (16))); for(int i = 0; i < vlen; ++i) { input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); @@ -40,6 +41,13 @@ void qa_32f_min_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_min_aligned16_manual(output02, input0, input1, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse"); } @@ -54,6 +62,7 @@ void qa_32f_min_aligned16::t1() { for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); } } diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index c3e65866b..105d32d0c 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -63,10 +63,10 @@ void qa_32fc_magnitude_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } + //for(int i = 0; i < 10; ++i) { + // printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); + // printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); + //} for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc index c3c27b69b..f6a334da7 100644 --- a/volk/lib/qa_volk.cc +++ b/volk/lib/qa_volk.cc @@ -118,7 +118,6 @@ CppUnit::TestSuite * qa_volk::suite() { CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); - s->addTest(qa_16s_quad_max_star_aligned16::suite()); s->addTest(qa_32fc_dot_prod_aligned16::suite()); s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 3f105fd80..797efee18 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -35,13 +35,17 @@ volk_32f_subtract_aligned16_orc_impl.orc \ volk_32f_divide_aligned16_orc_impl.orc \ volk_32f_multiply_aligned16_orc_impl.orc \ volk_32f_sqrt_aligned16_orc_impl.orc \ -volk_16sc_magnitude_32f_aligned16_orc_impl.orc \ +volk_32f_max_aligned16_orc_impl.orc \ +volk_32f_min_aligned16_orc_impl.orc \ volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ -volk_32fc_magnitude_16s_aligned16_orc_impl.orc +volk_32fc_magnitude_16s_aligned16_orc_impl.orc \ +volk_16sc_deinterleave_16s_aligned16_orc_impl.orc \ +volk_16sc_deinterleave_32f_aligned16_orc_impl.orc \ +volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc -my_ORCC_FLAGS = --implementation --lazy-init $(ORCC_FLAGS) +my_ORCC_FLAGS = --implementation $(ORCC_FLAGS) .orc.c: $(ORCC) $(my_ORCC_FLAGS) -o $@ $< diff --git a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc new file mode 100644 index 000000000..d226064a7 --- /dev/null +++ b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_16sc_deinterleave_16s_aligned16_orc_impl +.dest 2 idst +.dest 2 qdst +.source 4 src +splitlw qdst, idst, src diff --git a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc new file mode 100644 index 000000000..dddf682ca --- /dev/null +++ b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc @@ -0,0 +1,12 @@ +.function volk_16sc_deinterleave_32f_aligned16_orc_impl +.dest 4 idst +.dest 4 qdst +.source 4 src +.floatparam 4 scalar +.temp 8 iql +.temp 8 iqf + +x2 convswl iql, src +x2 convlf iqf, iql +x2 divf iqf, iqf, scalar +splitql qdst, idst, iqf diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc new file mode 100644 index 000000000..609750096 --- /dev/null +++ b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc @@ -0,0 +1,6 @@ +.function volk_16sc_deinterleave_real_8s_aligned16_orc_impl +.dest 1 dst +.source 4 src +.temp 2 iw +select0lw iw, src +convhwb dst, iw diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc index f6c959c00..83b867dca 100644 --- a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc +++ b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc @@ -1,6 +1,7 @@ .function volk_16sc_magnitude_16s_aligned16_orc_impl .source 4 src .dest 2 dst +.floatparam 4 scalar .temp 2 reals .temp 2 imags .temp 4 reall @@ -12,13 +13,16 @@ .temp 4 rootl splitlw reals, imags, src -convwl reall, reals -convwl imagl, imags +convswl reall, reals +convswl imagl, imags convlf realf, reall convlf imagf, imagl -mulf realf, realf, (1.0 / 32768.0) -mulf imagf, imagf, (1.0 / 32768.0) +divf realf, realf, scalar +divf imagf, imagf, scalar +mulf realf, realf, realf +mulf imagf, imagf, imagf addf sumf, realf, imagf sqrtf rootf, sumf +mulf rootf, rootf, scalar convfl rootl, rootf -conflw dst, rootl +convlw dst, rootl diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc index 44654ad8e..6d2ed8197 100644 --- a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc +++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc @@ -2,7 +2,6 @@ .source 4 src .dest 4 dst .floatparam 4 scalar -.temp 4 invscalar .temp 4 reall .temp 4 imagl .temp 2 reals @@ -11,14 +10,15 @@ .temp 4 imagf .temp 4 sumf -divf invscalar, 1.0, scalar + + splitlw reals, imags, src convswl reall, reals convswl imagl, imags convlf realf, reall convlf imagf, imagl -mulf realf, realf, invscalar -mulf imagf, imagf, invscalar +divf realf, realf, scalar +divf imagf, imagf, scalar mulf realf, realf, realf mulf imagf, imagf, imagf addf sumf, realf, imagf diff --git a/volk/orc/volk_32f_max_aligned16_orc_impl.orc b/volk/orc/volk_32f_max_aligned16_orc_impl.orc new file mode 100644 index 000000000..97f48ba4a --- /dev/null +++ b/volk/orc/volk_32f_max_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_max_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +maxf dst, src1, src2 diff --git a/volk/orc/volk_32f_min_aligned16_orc_impl.orc b/volk/orc/volk_32f_min_aligned16_orc_impl.orc new file mode 100644 index 000000000..a597933de --- /dev/null +++ b/volk/orc/volk_32f_min_aligned16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_min_aligned16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +minf dst, src1, src2 diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc index db8405e59..f71dd9a37 100644 --- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc +++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc @@ -2,24 +2,18 @@ .source 8 src .dest 2 dst .floatparam 4 scalar -.temp 4 invscalar -.temp 4 reall -.temp 4 imagl -.temp 4 realf -.temp 4 imagf +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if .temp 4 sumf .temp 4 rootf .temp 4 rootl -divf invscalar, 1.0, scalar -splitql reall, imagl, src -convlf realf, reall -convlf imagf, imagl -mulf realf, realf, invscalar -mulf imagf, imagf, invscalar -mulf realf, realf, realf -mulf imagf, imagf, imagf -addf sumf, realf, imagf +x2 mulf prodiqf, src, src +splitql qf, if, prodiqf +addf sumf, if, qf sqrtf rootf, sumf +mulf rootf, rootf, scalar convfl rootl, rootf -convlw dst, rootl +convssslw dst, rootl diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc index cc5c85b45..47a10531d 100644 --- a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc +++ b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc @@ -1,21 +1,13 @@ .function volk_32fc_magnitude_32f_aligned16_orc_impl .source 8 src .dest 4 dst -.floatparam 4 scalar -.temp 4 invscalar -.temp 4 reall -.temp 4 imagl -.temp 4 realf -.temp 4 imagf +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if .temp 4 sumf -divf invscalar, 1.0, scalar -splitql reall, imagl, src -convlf realf, reall -convlf imagf, imagl -mulf realf, realf, invscalar -mulf imagf, imagf, invscalar -mulf realf, realf, realf -mulf imagf, imagf, imagf -addf sumf, realf, imagf +x2 mulf prodiqf, src, src +splitql qf, if, prodiqf +addf sumf, if, qf sqrtf dst, sumf diff --git a/volk/volk.pc.in b/volk/volk.pc.in index a24298856..b03dbdada 100644 --- a/volk/volk.pc.in +++ b/volk/volk.pc.in @@ -10,6 +10,6 @@ Name: volk Description: VOLK.. Vector Optimized Library of Kernels Requires: Version: @VERSION@ -Libs: -lvolk -lvolk_runtime +Libs: -lvolk -lvolk_runtime -lvolk_orc Cflags: -I${includedir} ${LV_CXXFLAGS} -- cgit From 200720da362e30f74083aad4dc106e4a057638bf Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 17 Dec 2010 12:20:16 -0800 Subject: Volk: Magnitude functions. 32fc_magnitude_16s currently clips to +MAX instead of -MAX. --- volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 6 ++--- volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 8 +++---- volk/orc/Makefile.am | 1 + .../volk_16sc_magnitude_16s_aligned16_orc_impl.orc | 27 +++++++++------------- .../volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 2 +- 5 files changed, 20 insertions(+), 24 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc index dd4ae75ff..d00315b57 100644 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc @@ -40,14 +40,14 @@ void qa_16sc_magnitude_16s_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("generic_time: %f\n", total); -/* start = clock(); + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc"); } end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("orc_time: %f\n", total); -*/ + start = clock(); for(int count = 0; count < ITERS; ++count) { volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); @@ -73,7 +73,7 @@ void qa_16sc_magnitude_16s_aligned16::t1() { //printf("%d...%d\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); - //CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); } } diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index 105d32d0c..53b3bf790 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -63,10 +63,10 @@ void qa_32fc_magnitude_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3_time: %f\n", total); - //for(int i = 0; i < 10; ++i) { - // printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); - // printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); - //} + for(int i = 0; i < 10; ++i) { + printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); + printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); + } for(int i = 0; i < vlen; ++i) { //printf("%d...%d\n", output0[i], output01[i]); diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 797efee18..a469638c1 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -39,6 +39,7 @@ volk_32f_max_aligned16_orc_impl.orc \ volk_32f_min_aligned16_orc_impl.orc \ volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ volk_32fc_magnitude_16s_aligned16_orc_impl.orc \ +volk_16sc_magnitude_16s_aligned16_orc_impl.orc \ volk_16sc_deinterleave_16s_aligned16_orc_impl.orc \ volk_16sc_deinterleave_32f_aligned16_orc_impl.orc \ volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc index 83b867dca..088f56312 100644 --- a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc +++ b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc @@ -2,26 +2,21 @@ .source 4 src .dest 2 dst .floatparam 4 scalar -.temp 2 reals -.temp 2 imags -.temp 4 reall -.temp 4 imagl -.temp 4 realf -.temp 4 imagf +.temp 8 iql +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if .temp 4 sumf .temp 4 rootf .temp 4 rootl -splitlw reals, imags, src -convswl reall, reals -convswl imagl, imags -convlf realf, reall -convlf imagf, imagl -divf realf, realf, scalar -divf imagf, imagf, scalar -mulf realf, realf, realf -mulf imagf, imagf, imagf -addf sumf, realf, imagf +x2 convswl iql, src +x2 convlf iqf, iql +x2 divf iqf, iqf, scalar +x2 mulf prodiqf, iqf, iqf +splitql qf, if, prodiqf +addf sumf, if, qf sqrtf rootf, sumf mulf rootf, rootf, scalar convfl rootl, rootf diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc index f71dd9a37..48b831021 100644 --- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc +++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc @@ -16,4 +16,4 @@ addf sumf, if, qf sqrtf rootf, sumf mulf rootf, rootf, scalar convfl rootl, rootf -convssslw dst, rootl +convlw dst, rootl -- cgit From 0e92b93f21fc9c324c379bc318120d414e7422cc Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 17 Dec 2010 13:35:40 -0800 Subject: Volk: Orc impl for 32fc_magnitude_16s saturates at -max instead of +max. --- volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 6 +++--- volk/lib/qa_volk.cc | 1 + volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc index 53b3bf790..93d4ec150 100644 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc @@ -63,9 +63,9 @@ void qa_32fc_magnitude_16s_aligned16::t1() { total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3_time: %f\n", total); - for(int i = 0; i < 10; ++i) { - printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); - printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); + for(int i = 0; i < 1; ++i) { + // printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); + // printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); } for(int i = 0; i < vlen; ++i) { diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc index f6a334da7..c3c27b69b 100644 --- a/volk/lib/qa_volk.cc +++ b/volk/lib/qa_volk.cc @@ -118,6 +118,7 @@ CppUnit::TestSuite * qa_volk::suite() { CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); + s->addTest(qa_16s_quad_max_star_aligned16::suite()); s->addTest(qa_32fc_dot_prod_aligned16::suite()); s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc index 48b831021..15f8fdff0 100644 --- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc +++ b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc @@ -9,11 +9,15 @@ .temp 4 sumf .temp 4 rootf .temp 4 rootl +.temp 4 maskl x2 mulf prodiqf, src, src splitql qf, if, prodiqf addf sumf, if, qf sqrtf rootf, sumf mulf rootf, rootf, scalar +cmpltf maskl, scalar, rootf +andl maskl, maskl, 0x80000000 +orl rootf, rootf, maskl convfl rootl, rootf -convlw dst, rootl +convssslw dst, rootl -- cgit From ed78ba5d9999bbe50507373a1aa2877ef0da64c6 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 17 Dec 2010 22:58:40 -0600 Subject: volk: Fix for popcnt's 64/32-bit issues. --- volk/include/volk/volk_64u_popcnt_aligned16.h | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h index e8e1396e7..4d62f9375 100644 --- a/volk/include/volk/volk_64u_popcnt_aligned16.h +++ b/volk/include/volk/volk_64u_popcnt_aligned16.h @@ -36,37 +36,13 @@ static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64 #endif /*LV_HAVE_GENERIC*/ -#if LV_HAVE_SSE4_2 +#if LV_HAVE_SSE4_2 && LV_HAVE_64 #include static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { -#if LV_64 *ret = _mm_popcnt_u64(value); -#else - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; -#endif } #endif /*LV_HAVE_SSE4_2*/ -- cgit From 79c514b542d25e709903b41cfdc1673aae35ac1d Mon Sep 17 00:00:00 2001 From: Eric Blossom Date: Thu, 23 Dec 2010 14:29:56 -0800 Subject: Update volk .gitignores --- volk/include/volk/.gitignore | 1 + volk/lib/.gitignore | 1 + 2 files changed, 2 insertions(+) (limited to 'volk') diff --git a/volk/include/volk/.gitignore b/volk/include/volk/.gitignore index ae4824cc3..be8358f3a 100644 --- a/volk/include/volk/.gitignore +++ b/volk/include/volk/.gitignore @@ -17,3 +17,4 @@ /volk_runtime.h /volk_tables.h /volk_typedefs.h +/volk_mktables diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore index 573fb1618..0f17543ab 100644 --- a/volk/lib/.gitignore +++ b/volk/lib/.gitignore @@ -19,3 +19,4 @@ /volk_mktables.c /volk_proccpu_sim.c /volk_runtime.c +/test_all -- cgit From 5b45b875ed58fd66234764a05da42c6eaff22c4d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 11 Jan 2011 15:17:55 -0800 Subject: Volk: Added more Orc routines (including complex multiply). Started redoing the testing framework so it's easier to add new archs to tests. --- volk/include/volk/volk_32f_normalize_aligned16.h | 15 ++++ .../volk/volk_32fc_32f_multiply_aligned16.h | 13 ++++ volk/include/volk/volk_32fc_multiply_aligned16.h | 17 +++++ volk/lib/Makefile.am | 2 + volk/lib/qa_32f_normalize_aligned16.cc | 13 ++++ volk/lib/qa_32fc_32f_multiply_aligned16.cc | 84 ++++++++++------------ volk/lib/qa_32fc_multiply_aligned16.cc | 12 ++++ volk/orc/Makefile.am | 3 + 8 files changed, 112 insertions(+), 47 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h index 1aabb1d9d..27fb5f7fa 100644 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -60,6 +60,21 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points); +static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h index 436656ca0..304ed8e2d 100644 --- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h @@ -76,6 +76,19 @@ static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h index 6a1649fdb..c8f2418c3 100644 --- a/volk/include/volk/volk_32fc_multiply_aligned16.h +++ b/volk/include/volk/volk_32fc_multiply_aligned16.h @@ -4,6 +4,7 @@ #include #include #include +#include #if LV_HAVE_SSE3 #include @@ -72,6 +73,22 @@ static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, cons } #endif /* LV_HAVE_GENERIC */ +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 253033461..0aeafe4aa 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -156,6 +156,7 @@ endif # ---------------------------------------------------------------- libvolk_qa_la_SOURCES = \ qa_volk.cc \ + qa_utils.cc \ qa_16s_quad_max_star_aligned16.cc \ qa_32fc_dot_prod_aligned16.cc \ qa_32fc_square_dist_aligned16.cc \ @@ -257,6 +258,7 @@ libvolk_qa_la_LIBADD = \ noinst_HEADERS = \ volk_init.h \ qa_volk.h \ + qa_utils.h \ assembly.h \ qa_16s_quad_max_star_aligned16.h \ qa_32fc_dot_prod_aligned16.h \ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc index 1c7b485a6..0da43ecff 100644 --- a/volk/lib/qa_32f_normalize_aligned16.cc +++ b/volk/lib/qa_32f_normalize_aligned16.cc @@ -26,13 +26,16 @@ void qa_32f_normalize_aligned16::t1() { float* output0; float* output01; + float* output02; ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); + ret = posix_memalign((void**)&output02, 16, vlen*sizeof(float)); for(int i = 0; i < vlen; ++i) { output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); } memcpy(output01, output0, vlen*sizeof(float)); + memcpy(output02, output0, vlen*sizeof(float)); printf("32f_normalize_aligned\n"); start = clock(); @@ -49,6 +52,14 @@ void qa_32f_normalize_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32f_normalize_aligned16_manual(output02, 1.15, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); + for(int i = 0; i < 1; ++i) { //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); @@ -57,10 +68,12 @@ void qa_32f_normalize_aligned16::t1() { for(int i = 0; i < vlen; ++i) { // printf("%e...%e\n", output0[i], output01[i]); CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output02[i], fabs(output0[i])*1e-4); } free(output0); free(output01); + free(output02); } #endif diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc index 4eba0a3cd..7bb8d21c1 100644 --- a/volk/lib/qa_32fc_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -2,28 +2,12 @@ #include #include #include -#include #include - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); +#include +#include #define ERR_DELTA (1e-4) -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifdef LV_HAVE_SSE3 void qa_32fc_32f_multiply_aligned16::t1() { const int vlen = 2046; @@ -36,50 +20,56 @@ void qa_32fc_32f_multiply_aligned16::t1() { std::complex* input; float * taps; int i; + std::vector archs; + archs.push_back("generic"); +#ifdef LV_HAVE_SSE3 + archs.push_back("sse3"); +#endif +#ifdef LV_HAVE_ORC + archs.push_back("orc"); +#endif - std::complex* result_generic; - std::complex* result_sse3; + std::vector* > results; ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float)); + + for(i=0; i < archs.size(); i++) { + std::complex *ptr; + ret = posix_memalign((void**)&ptr, 16, vlen * 2 * sizeof(float)); + if(ret) { + printf("Couldn't allocate memory\n"); + exit(1); + } + results.push_back(ptr); + } random_floats((float*)input, vlen * 2); random_floats(taps, vlen); printf("32fc_32f_multiply_aligned16\n"); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); + for(i=0; i < archs.size(); i++) { + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_32f_multiply_aligned16_manual(results[i], input, taps, vlen, archs[i].c_str()); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("%s_time: %f\n", archs[i].c_str(), total); } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - for(i = 0; i < vlen; i++){ - assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + for(i=0; i < vlen; i++) { + int j = 1; + for(j; j < archs.size(); j++) { + assertcomplexEqual(results[0][i], results[j][i], ERR_DELTA); + } } free(input); free(taps); - free(result_generic); - free(result_sse3); - -} -#else -void qa_32fc_32f_multiply_aligned16::t1() { - printf("sse3 not available... no test performed\n"); + for(i=0; i < archs.size(); i++) { + free(results[i]); + } } -#endif /* LV_HAVE_SSE3 */ - diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc index e1f7eab3d..022b58ad6 100644 --- a/volk/lib/qa_32fc_multiply_aligned16.cc +++ b/volk/lib/qa_32fc_multiply_aligned16.cc @@ -41,11 +41,13 @@ void qa_32fc_multiply_aligned16::t1() { std::complex* result_generic; std::complex* result_sse3; + std::complex* result_orc; ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); + ret = posix_memalign((void**)&result_orc, 16, vlen*2*sizeof(float)); random_floats((float*)input, vlen * 2); random_floats((float*)taps, vlen * 2); @@ -67,15 +69,25 @@ void qa_32fc_multiply_aligned16::t1() { end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; printf("sse3_time: %f\n", total); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_32fc_multiply_aligned16_manual(result_orc, input, taps, vlen, "orc"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("orc_time: %f\n", total); for(i = 0; i < vlen; i++){ assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); + assertcomplexEqual(result_generic[i], result_orc[i], ERR_DELTA); } free(input); free(taps); free(result_generic); free(result_sse3); + free(result_orc); } #else diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index a469638c1..066050a7c 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -34,9 +34,12 @@ volk_32f_add_aligned16_orc_impl.orc \ volk_32f_subtract_aligned16_orc_impl.orc \ volk_32f_divide_aligned16_orc_impl.orc \ volk_32f_multiply_aligned16_orc_impl.orc \ +volk_32fc_multiply_aligned16_orc_impl.orc \ +volk_32fc_32f_multiply_aligned16_orc_impl.orc \ volk_32f_sqrt_aligned16_orc_impl.orc \ volk_32f_max_aligned16_orc_impl.orc \ volk_32f_min_aligned16_orc_impl.orc \ +volk_32f_normalize_aligned16_orc_impl.orc \ volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ volk_32fc_magnitude_16s_aligned16_orc_impl.orc \ volk_16sc_magnitude_16s_aligned16_orc_impl.orc \ -- cgit From c501dc110d3cc7cfcfff178fecb21f30ac9bd54c Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 11 Jan 2011 15:35:04 -0800 Subject: Volk: fixed normalize. --- volk/include/volk/volk_32f_normalize_aligned16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h index 27fb5f7fa..323d0513c 100644 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ b/volk/include/volk/volk_32f_normalize_aligned16.h @@ -68,10 +68,10 @@ static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const \param bVector One of the vectors to be normalizeed \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector */ -extern void volk_32f_normalize_aligned16_orc_impl(float* vecBuffer, const float scalar, unsigned int num_points); +extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ float invscalar = 1.0 / scalar; - volk_32f_normalize_aligned16_orc_impl(vecBuffer, invscalar, num_points); + volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); } #endif /* LV_HAVE_GENERIC */ -- cgit From c77bb3e71562daa68e9a195a0131b7cc04324784 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 12 Jan 2011 19:20:35 -0800 Subject: Volk: Working on a new QA architecture that doesn't require individual test programs. --- volk/lib/Makefile.am | 2 - volk/lib/qa_32fc_32f_multiply_aligned16.cc | 6 +- volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 2 +- volk/lib/qa_utils.cc | 223 ++++++++++++++++++++++++++ volk/lib/qa_utils.h | 19 +++ volk/lib/qa_volk.cc | 2 +- 6 files changed, 247 insertions(+), 7 deletions(-) create mode 100644 volk/lib/qa_utils.cc create mode 100644 volk/lib/qa_utils.h (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 0aeafe4aa..a10b0a362 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -156,7 +156,6 @@ endif # ---------------------------------------------------------------- libvolk_qa_la_SOURCES = \ qa_volk.cc \ - qa_utils.cc \ qa_16s_quad_max_star_aligned16.cc \ qa_32fc_dot_prod_aligned16.cc \ qa_32fc_square_dist_aligned16.cc \ @@ -181,7 +180,6 @@ libvolk_qa_la_SOURCES = \ qa_32f_dot_prod_aligned16.cc \ qa_32f_dot_prod_unaligned16.cc \ qa_32f_fm_detect_aligned16.cc \ - qa_32fc_32f_multiply_aligned16.cc \ qa_32fc_multiply_aligned16.cc \ qa_32f_divide_aligned16.cc \ qa_32f_multiply_aligned16.cc \ diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc index 7bb8d21c1..b80e0e008 100644 --- a/volk/lib/qa_32fc_32f_multiply_aligned16.cc +++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc @@ -5,10 +5,11 @@ #include #include #include +#include -#define ERR_DELTA (1e-4) +#define TOLERANCE (1e-4) -void qa_32fc_32f_multiply_aligned16::t1() { +void qa_32fc_32f_multiply_aligned16(void) { const int vlen = 2046; const int ITERS = 100000; @@ -72,4 +73,3 @@ void qa_32fc_32f_multiply_aligned16::t1() { free(results[i]); } } - diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc index 94e63e37d..f753e1107 100644 --- a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc +++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc @@ -40,7 +40,7 @@ void qa_8sc_deinterleave_16s_aligned16::t1() { start = clock(); for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); + volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "monkeys"); } end = clock(); total = (double)(end-start)/(double)CLOCKS_PER_SEC; diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc new file mode 100644 index 000000000..4d93ca62a --- /dev/null +++ b/volk/lib/qa_utils.cc @@ -0,0 +1,223 @@ +#include "qa_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +//#include +//#include +#include +#include +#include +#include +//#include + +float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +template +t *make_aligned_buffer(unsigned int len) { + t *buf; + int ret; + ret = posix_memalign((void**)&buf, 16, len * sizeof(t)); + assert(ret == 0); + return buf; +} + +void make_buffer_for_signature(std::vector &buffs, std::vector inputsig, unsigned int vlen) { + BOOST_FOREACH(std::string sig, inputsig) { + if (sig=="32fc" || sig=="64f") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="32f" || sig=="32u" || sig=="32s" || sig=="16sc") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="16s" || sig=="16u") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="8s" || sig=="8u") buffs.push_back((void *) make_aligned_buffer(vlen)); + else std::cout << "Invalid type!" << std::endl; + } +} + +static std::vector get_arch_list(const int archs[]) { + std::vector archlist; + int num_archs = archs[0]; + + //there has got to be a way to query these arches + for(int i = 0; i < num_archs; i++) { + switch(archs[i+1]) { + case (1< valid_types = boost::assign::list_of("32fc")("32f")("32s")("32u")("16sc")("16s")("16u")("8s")("8u"); + + BOOST_FOREACH(std::string this_type, valid_types) { + if(type == this_type) return true; + } + return false; +} + + +static void get_function_signature(std::vector &inputsig, + std::vector &outputsig, + std::string name) { + boost::char_separator sep("_"); + boost::tokenizer > tok(name, sep); + std::vector toked; + tok.assign(name); + toked.assign(tok.begin(), tok.end()); + + assert(toked[0] == "volk"); + + inputsig.push_back(toked[1]); //mandatory + int pos = 2; + bool valid_type = true; + while(valid_type && pos < toked.size()) { + if(is_valid_type(toked[pos])) inputsig.push_back(toked[pos]); + else valid_type = false; + pos++; + } + while(!valid_type && pos < toked.size()) { + if(is_valid_type(toked[pos])) valid_type = true; + pos++; + } + while(valid_type && pos < toked.size()) { + if(is_valid_type(toked[pos])) outputsig.push_back(toked[pos]); + else valid_type = false; + pos++; + } + + //if there's no explicit output sig then assume the output is the same as the first input + if(outputsig.size() == 0) outputsig.push_back(inputsig[0]); + assert(inputsig.size() != 0); + assert(outputsig.size() != 0); +} + +inline void run_cast_test2(volk_fn_2arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(outbuff, inbuffs[0], vlen, arch.c_str()); +} + +inline void run_cast_test3(volk_fn_3arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(outbuff, inbuffs[0], inbuffs[1], vlen, arch.c_str()); +} + +inline void run_cast_test4(volk_fn_4arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(outbuff, inbuffs[0], inbuffs[1], inbuffs[2], vlen, arch.c_str()); +} + +bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) { + std::cout << "RUN_VOLK_TESTS: " << name << std::endl; + + //first let's get a list of available architectures for the test + std::vector arch_list = get_arch_list(archs); + + BOOST_FOREACH(std::string arch, arch_list) { + std::cout << "Found an arch: " << arch << std::endl; + } + + //now we have to get a function signature by parsing the name + std::vector inputsig, outputsig; + get_function_signature(inputsig, outputsig, name); + + for(int i=0; i inbuffs; + make_buffer_for_signature(inbuffs, inputsig, vlen); + + //and set the input buffers to something random + //TODO + + //allocate output buffers -- one for each output for each arch + std::vector outbuffs; + BOOST_FOREACH(std::string arch, arch_list) { + make_buffer_for_signature(outbuffs, outputsig, vlen); + } + + //now run the test + clock_t start, end; + for(int i = 0; i < arch_list.size(); i++) { + start = clock(); + switch(outputsig.size()+inputsig.size()) { + case 2: + run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + break; + case 3: + run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + break; + case 4: + run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + break; + default: + break; + } + end = clock(); + std::cout << arch_list[i] << " completed in " << (double)(end-start)/(double)CLOCKS_PER_SEC << "s" << std::endl; + } + + //and now compare each output to the generic output + //first we have to know which output is the generic one, they aren't in order... + int generic_offset; + for(int i=0; i +#include +#include + +float uniform(void); +void random_floats(float *buf, unsigned n); + +bool run_volk_tests(const int[], void(*)(), std::string, float, int, int); + +#define VOLK_RUN_TESTS(func, tol, len, iter) run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter) + +typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); +typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); + +#endif //VOLK_QA_UTILS_H diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc index c3c27b69b..8e7e59768 100644 --- a/volk/lib/qa_volk.cc +++ b/volk/lib/qa_volk.cc @@ -143,7 +143,7 @@ qa_volk::suite() s->addTest(qa_32f_dot_prod_aligned16::suite()); s->addTest(qa_32f_dot_prod_unaligned16::suite()); s->addTest(qa_32f_fm_detect_aligned16::suite()); - s->addTest(qa_32fc_32f_multiply_aligned16::suite()); + //s->addTest(qa_32fc_32f_multiply_aligned16::suite()); s->addTest(qa_32fc_multiply_aligned16::suite()); s->addTest(qa_32f_divide_aligned16::suite()); s->addTest(qa_32f_multiply_aligned16::suite()); -- cgit From 9a527257014878cac993ffe854bf8fdacc412be6 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 14 Jan 2011 13:07:06 -0800 Subject: Volk: QA code fixes, more Orc routines. Broke the 32fc_multiply Orc impl because I'm lame and lost some work. Fixed volk_8s_convert_16s Orc impl. Still need to rename functions and modify the QA sig parser to match. Then rewrite makefiles. --- volk/lib/qa_utils.cc | 94 +++++++++++++++++----- volk/lib/qa_utils.h | 2 +- volk/orc/volk_32f_normalize_aligned16_orc_impl.orc | 5 ++ .../volk_32fc_32f_multiply_aligned16_orc_impl.orc | 7 ++ volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc | 6 ++ .../orc/volk_8s_convert_16s_aligned16_orc_impl.orc | 3 +- 6 files changed, 97 insertions(+), 20 deletions(-) create mode 100644 volk/orc/volk_32f_normalize_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 4d93ca62a..fa21db487 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -3,7 +3,7 @@ #include #include #include -#include +//#include #include #include #include @@ -13,19 +13,39 @@ #include #include #include -//#include float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) } -void -random_floats (float *buf, unsigned n) +void random_floats (float *buf, unsigned n) { for (unsigned i = 0; i < n; i++) buf[i] = uniform (); } +void load_random_data(void *data, std::string sig, unsigned int n) { + if(sig == "32fc") { + random_floats((float *)data, n*2); + } else if(sig == "32f") { + random_floats((float *)data, n); + } else if(sig == "32u") { + for(int i=0; i((RAND_MAX/2))) * 32768.0)); + } else if(sig == "16sc") { + for(int i=0; i((RAND_MAX/2))) * 32768.0)); + } else if(sig == "8u") { + for(int i=0; i((RAND_MAX/2)) * 256.0)); + } else if(sig == "8s") { + for(int i=0; i((RAND_MAX/2)) * 128.0)); + } else std::cout << "load_random_data(): Invalid sig: " << sig << std::endl; +} + template t *make_aligned_buffer(unsigned int len) { t *buf; @@ -37,11 +57,11 @@ t *make_aligned_buffer(unsigned int len) { void make_buffer_for_signature(std::vector &buffs, std::vector inputsig, unsigned int vlen) { BOOST_FOREACH(std::string sig, inputsig) { - if (sig=="32fc" || sig=="64f") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="32f" || sig=="32u" || sig=="32s" || sig=="16sc") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="16s" || sig=="16u") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="8s" || sig=="8u") buffs.push_back((void *) make_aligned_buffer(vlen)); - else std::cout << "Invalid type!" << std::endl; + if (sig=="32fc" || sig=="64f" || sig=="64u") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="32f" || sig=="32u" || sig=="32s" || sig=="16sc") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="16s" || sig=="16u" || sig=="8sc") buffs.push_back((void *) make_aligned_buffer(vlen)); + else if(sig=="8s" || sig=="8u") buffs.push_back((void *) make_aligned_buffer(vlen)); + else std::cout << "Invalid type: " << sig << std::endl; } } @@ -90,7 +110,7 @@ static std::vector get_arch_list(const int archs[]) { } static bool is_valid_type(std::string type) { - std::vector valid_types = boost::assign::list_of("32fc")("32f")("32s")("32u")("16sc")("16s")("16u")("8s")("8u"); + std::vector valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f")("32s")("32u")("16sc")("16s")("16u")("8s")("8sc")("8u"); BOOST_FOREACH(std::string this_type, valid_types) { if(type == this_type) return true; @@ -120,16 +140,23 @@ static void get_function_signature(std::vector &inputsig, } while(!valid_type && pos < toked.size()) { if(is_valid_type(toked[pos])) valid_type = true; - pos++; + else pos++; } while(valid_type && pos < toked.size()) { if(is_valid_type(toked[pos])) outputsig.push_back(toked[pos]); else valid_type = false; pos++; } - - //if there's no explicit output sig then assume the output is the same as the first input - if(outputsig.size() == 0) outputsig.push_back(inputsig[0]); + + //if there's no output sig and only one input sig, assume there are 2 inputs + //this handles conversion fn's (which have a specified output sig) and most of the rest + if(outputsig.size() == 0 && inputsig.size() == 1) { + outputsig.push_back(inputsig[0]); + inputsig.push_back(inputsig[0]); + }//if there's no explicit output sig then assume the output is the same as the first input + else if(outputsig.size() == 0) outputsig.push_back(inputsig[0]); + + assert(inputsig.size() != 0); assert(outputsig.size() != 0); } @@ -168,7 +195,9 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, make_buffer_for_signature(inbuffs, inputsig, vlen); //and set the input buffers to something random - //TODO + for(int i=0; i outbuffs; @@ -204,9 +233,38 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, if(arch_list[i] == "generic") generic_offset=i; for(int i=0; i tol) { + std::cout << "Generic: " << ((float *)(outbuffs[generic_offset]))[j] << " " << arch_list[i] << ": " << ((float *)(outbuffs[i]))[j] << std::endl; + return 1; + } + } + } else if(outputsig[0] == "32f") { + for(int j=0; j tol) { + std::cout << "Generic: " << ((float *)(outbuffs[generic_offset]))[j] << " " << arch_list[i] << ": " << ((float *)(outbuffs[i]))[j] << std::endl; + return 1; + } + } + } else if(outputsig[0] == "32u" || outputsig[0] == "32s" || outputsig[0] == "16sc") { + for(int j=0; j 120000 volk/config.guess mode change 100755 => 120000 volk/config.sub create mode 100644 volk/include/volk/volk_16s_add_quad_a16.h delete mode 100644 volk/include/volk/volk_16s_add_quad_aligned16.h create mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_aligned16.h delete mode 100644 volk/include/volk/volk_16s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_16s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_aligned16.h create mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_unaligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_aligned16.h create mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h create mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_quad_max_star_aligned16.h create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h create mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_aligned16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_32f_aligned16.h create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h create mode 100644 volk/include/volk/volk_16u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_16u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h create mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h create mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h create mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_accumulator_aligned16.h create mode 100644 volk/include/volk/volk_32f_accumulator_s32f_a16.h delete mode 100644 volk/include/volk/volk_32f_add_aligned16.h create mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h delete mode 100644 volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_16s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_16s_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_32s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_32s_unaligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_a16.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_aligned16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_8s_aligned16.h delete mode 100644 volk/include/volk/volk_32f_convert_8s_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_divide_aligned16.h delete mode 100644 volk/include/volk/volk_32f_dot_prod_aligned16.h delete mode 100644 volk/include/volk/volk_32f_dot_prod_unaligned16.h delete mode 100644 volk/include/volk/volk_32f_fm_detect_aligned16.h create mode 100644 volk/include/volk/volk_32f_index_max_16u_a16.h delete mode 100644 volk/include/volk/volk_32f_index_max_aligned16.h delete mode 100644 volk/include/volk/volk_32f_interleave_16sc_aligned16.h delete mode 100644 volk/include/volk/volk_32f_interleave_32fc_aligned16.h delete mode 100644 volk/include/volk/volk_32f_max_aligned16.h delete mode 100644 volk/include/volk/volk_32f_min_aligned16.h delete mode 100644 volk/include/volk/volk_32f_multiply_aligned16.h delete mode 100644 volk/include/volk/volk_32f_normalize_aligned16.h delete mode 100644 volk/include/volk/volk_32f_power_aligned16.h create mode 100644 volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_normalize_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_power_32f_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_stddev_32f_a16.h create mode 100644 volk/include/volk/volk_32f_sqrt_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_sqrt_aligned16.h delete mode 100644 volk/include/volk/volk_32f_stddev_aligned16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_aligned16.h delete mode 100644 volk/include/volk/volk_32f_subtract_aligned16.h delete mode 100644 volk/include/volk/volk_32f_sum_of_poly_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32f_multiply_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h create mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_atan2_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_dot_prod_aligned16.h create mode 100644 volk/include/volk/volk_32fc_index_max_16u_a16.h delete mode 100644 volk/include/volk/volk_32fc_index_max_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_16s_aligned16.h create mode 100644 volk/include/volk/volk_32fc_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_magnitude_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_multiply_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h create mode 100644 volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_square_dist_aligned16.h delete mode 100644 volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h create mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h create mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_and_aligned16.h delete mode 100644 volk/include/volk/volk_32s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_32s_convert_32f_unaligned16.h delete mode 100644 volk/include/volk/volk_32s_or_aligned16.h create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_32u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_32u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_32u_popcnt_a16.h delete mode 100644 volk/include/volk/volk_32u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_aligned16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_unaligned16.h delete mode 100644 volk/include/volk/volk_64f_max_aligned16.h delete mode 100644 volk/include/volk/volk_64f_min_aligned16.h create mode 100644 volk/include/volk/volk_64u_byteswap_a16.h delete mode 100644 volk/include/volk/volk_64u_byteswap_aligned16.h create mode 100644 volk/include/volk/volk_64u_popcnt_a16.h delete mode 100644 volk/include/volk/volk_64u_popcnt_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_aligned16.h create mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_unaligned16.h delete mode 100644 volk/include/volk/volk_8s_convert_32f_aligned16.h delete mode 100644 volk/include/volk/volk_8s_convert_32f_unaligned16.h create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h create mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h create mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h delete mode 100644 volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_16u_byteswap_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_add_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_divide_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_max_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_min_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_multiply_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_normalize_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_subtract_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc create mode 100644 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc create mode 100644 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32s_and_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_32s_or_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc delete mode 100644 volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc create mode 100644 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc (limited to 'volk') diff --git a/volk/config.guess b/volk/config.guess deleted file mode 100755 index 285237846..000000000 --- a/volk/config.guess +++ /dev/null @@ -1,1505 +0,0 @@ -#! /bin/sh -# Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 -# Free Software Foundation, Inc. - -timestamp='2010-08-21' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner. Please send patches (context -# diff format) to and include a ChangeLog -# entry. -# -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. -# -# You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] - -Output the configuration name of the system \`$me' is run on. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.guess ($timestamp) - -Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free -Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - * ) - break ;; - esac -done - -if test $# != 0; then - echo "$me: too many arguments$help" >&2 - exit 1 -fi - -trap 'exit 1' HUP INT TERM - -# CC_FOR_BUILD -- compiler used by this script. Note that the use of a -# compiler to aid in system detection is discouraged as it requires -# temporary files to be created and, as you can see below, it is a -# headache to deal with in a portable fashion. - -# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still -# use `HOST_CC' if defined, but it is deprecated. - -# Portable tmp directory creation inspired by the Autoconf team. - -set_cc_for_build=' -trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; -trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" HUP INT PIPE TERM ; -: ${TMPDIR=/tmp} ; - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; -dummy=$tmp/dummy ; -tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; -case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; - for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then - CC_FOR_BUILD="$c"; break ; - fi ; - done ; - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found ; - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; -esac ; set_cc_for_build= ;' - -# This is needed to find uname on a Pyramid OSx when run in the BSD universe. -# (ghazi@noc.rutgers.edu 1994-08-24) -if (test -f /.attbin/uname) >/dev/null 2>&1 ; then - PATH=$PATH:/.attbin ; export PATH -fi - -UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown -UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown -UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown -UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown - -# Note: order is significant - the case branches are not exclusive. - -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in - *:NetBSD:*:*) - # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, - # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently - # switched to ELF, *-*-netbsd* would select the old - # object file format. This provides both forward - # compatibility and a consistent mechanism for selecting the - # object file format. - # - # Note: NetBSD doesn't particularly care about the vendor - # portion of the name. We always set it to "unknown". - sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in - armeb) machine=armeb-unknown ;; - arm*) machine=arm-unknown ;; - sh3el) machine=shl-unknown ;; - sh3eb) machine=sh-unknown ;; - sh5el) machine=sh5le-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; - esac - # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in - arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build - if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ELF__ - then - # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). - # Return netbsd for either. FIX? - os=netbsd - else - os=netbsdelf - fi - ;; - *) - os=netbsd - ;; - esac - # The OS release - # Debian GNU/NetBSD machines have a different userland, and - # thus, need a distinct triplet. However, they do not need - # kernel version information, so it can be replaced with a - # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in - Debian*) - release='-gnu' - ;; - *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` - ;; - esac - # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: - # contains redundant information, the shorter form: - # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. - echo "${machine}-${os}${release}" - exit ;; - *:OpenBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} - exit ;; - *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} - exit ;; - *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} - exit ;; - macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd${UNAME_RELEASE} - exit ;; - *:MirBSD:*:*) - echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} - exit ;; - alpha:OSF1:*:*) - case $UNAME_RELEASE in - *4.0) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` - ;; - *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` - ;; - esac - # According to Compaq, /usr/sbin/psrinfo has been available on - # OSF/1 and Tru64 systems produced since 1995. I hope that - # covers most systems running today. This code pipes the CPU - # types through head -n 1, so we only detect the type of CPU 0. - ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` - case "$ALPHA_CPU_TYPE" in - "EV4 (21064)") - UNAME_MACHINE="alpha" ;; - "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; - "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; - "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; - "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; - "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; - "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; - "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; - "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; - "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; - "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; - "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; - "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; - esac - # A Pn.n version is a patched version. - # A Vn.n version is a released version. - # A Tn.n version is a released field test version. - # A Xn.n version is an unreleased experimental baselevel. - # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; - Amiga*:UNIX_System_V:4.0:*) - echo m68k-unknown-sysv4 - exit ;; - *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-amigaos - exit ;; - *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-morphos - exit ;; - *:OS/390:*:*) - echo i370-ibm-openedition - exit ;; - *:z/VM:*:*) - echo s390-ibm-zvmoe - exit ;; - *:OS400:*:*) - echo powerpc-ibm-os400 - exit ;; - arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} - exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) - echo arm-unknown-riscos - exit ;; - SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) - echo hppa1.1-hitachi-hiuxmpp - exit ;; - Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) - # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. - if test "`(/bin/universe) 2>/dev/null`" = att ; then - echo pyramid-pyramid-sysv3 - else - echo pyramid-pyramid-bsd - fi - exit ;; - NILE*:*:*:dcosx) - echo pyramid-pyramid-svr4 - exit ;; - DRS?6000:unix:4.0:6*) - echo sparc-icl-nx6 - exit ;; - DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) - case `/usr/bin/uname -p` in - sparc) echo sparc-icl-nx7; exit ;; - esac ;; - s390x:SunOS:*:*) - echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) - echo i386-pc-auroraux${UNAME_RELEASE} - exit ;; - i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) - eval $set_cc_for_build - SUN_ARCH="i386" - # If there is a compiler, see if it is configured for 64-bit objects. - # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. - # This test works for both compilers. - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then - if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - SUN_ARCH="x86_64" - fi - fi - echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:6*:*) - # According to config.sub, this is the proper way to canonicalize - # SunOS6. Hard to guess exactly what SunOS6 will be like, but - # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:*:*) - case "`/usr/bin/arch -k`" in - Series*|S4*) - UNAME_RELEASE=`uname -v` - ;; - esac - # Japanese Language versions have a version number like `4.1.3-JL'. - echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` - exit ;; - sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} - exit ;; - sun*:*:4.2BSD:*) - UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 - case "`/bin/arch`" in - sun3) - echo m68k-sun-sunos${UNAME_RELEASE} - ;; - sun4) - echo sparc-sun-sunos${UNAME_RELEASE} - ;; - esac - exit ;; - aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} - exit ;; - # The situation for MiNT is a little confusing. The machine name - # can be virtually everything (everything which is not - # "atarist" or "atariste" at least should have a processor - # > m68000). The system name ranges from "MiNT" over "FreeMiNT" - # to the lowercase version "mint" (or "freemint"). Finally - # the system name "TOS" denotes a system which is actually not - # MiNT. But MiNT is downward compatible to TOS, so this should - # be no problem. - atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; - hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; - *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; - m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} - exit ;; - powerpc:machten:*:*) - echo powerpc-apple-machten${UNAME_RELEASE} - exit ;; - RISC*:Mach:*:*) - echo mips-dec-mach_bsd4.3 - exit ;; - RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} - exit ;; - VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} - exit ;; - 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} - exit ;; - mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c -#ifdef __cplusplus -#include /* for printf() prototype */ - int main (int argc, char *argv[]) { -#else - int main (argc, argv) int argc; char *argv[]; { -#endif - #if defined (host_mips) && defined (MIPSEB) - #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); - #endif - #endif - exit (-1); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - SYSTEM_NAME=`$dummy $dummyarg` && - { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} - exit ;; - Motorola:PowerMAX_OS:*:*) - echo powerpc-motorola-powermax - exit ;; - Motorola:*:4.3:PL8-*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:Power_UNIX:*:*) - echo powerpc-harris-powerunix - exit ;; - m88k:CX/UX:7*:*) - echo m88k-harris-cxux7 - exit ;; - m88k:*:4*:R4*) - echo m88k-motorola-sysv4 - exit ;; - m88k:*:3*:R3*) - echo m88k-motorola-sysv3 - exit ;; - AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] - then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] - then - echo m88k-dg-dgux${UNAME_RELEASE} - else - echo m88k-dg-dguxbcs${UNAME_RELEASE} - fi - else - echo i586-dg-dgux${UNAME_RELEASE} - fi - exit ;; - M88*:DolphinOS:*:*) # DolphinOS (SVR3) - echo m88k-dolphin-sysv3 - exit ;; - M88*:*:R3*:*) - # Delta 88k system running SVR3 - echo m88k-motorola-sysv3 - exit ;; - XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) - echo m88k-tektronix-sysv3 - exit ;; - Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) - echo m68k-tektronix-bsd - exit ;; - *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` - exit ;; - ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. - echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id - exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' - i*86:AIX:*:*) - echo i386-ibm-aix - exit ;; - ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} - exit ;; - *:AIX:2:3) - if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - - main() - { - if (!__power_pc()) - exit(1); - puts("powerpc-ibm-aix3.2.5"); - exit(0); - } -EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` - then - echo "$SYSTEM_NAME" - else - echo rs6000-ibm-aix3.2.5 - fi - elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then - echo rs6000-ibm-aix3.2.4 - else - echo rs6000-ibm-aix3.2 - fi - exit ;; - *:AIX:*:[4567]) - IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then - IBM_ARCH=rs6000 - else - IBM_ARCH=powerpc - fi - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} - exit ;; - *:AIX:*:*) - echo rs6000-ibm-aix - exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) - echo romp-ibm-bsd4.4 - exit ;; - ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to - exit ;; # report: romp-ibm BSD 4.3 - *:BOSX:*:*) - echo rs6000-bull-bosx - exit ;; - DPX/2?00:B.O.S.:*:*) - echo m68k-bull-sysv3 - exit ;; - 9000/[34]??:4.3bsd:1.*:*) - echo m68k-hp-bsd - exit ;; - hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) - echo m68k-hp-bsd4.4 - exit ;; - 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? ) HP_ARCH=m68k ;; - 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then - sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac - fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - - #define _HPUX_SOURCE - #include - #include - - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); - - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } -EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` - test -z "$HP_ARCH" && HP_ARCH=hppa - fi ;; - esac - if [ ${HP_ARCH} = "hppa2.0w" ] - then - eval $set_cc_for_build - - # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating - # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler - # generating 64-bit code. GNU and HP use different nomenclature: - # - # $ CC_FOR_BUILD=cc ./config.guess - # => hppa2.0w-hp-hpux11.23 - # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess - # => hppa64-hp-hpux11.23 - - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep -q __LP64__ - then - HP_ARCH="hppa2.0w" - else - HP_ARCH="hppa64" - fi - fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} - exit ;; - ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} - exit ;; - 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - int - main () - { - long cpu = sysconf (_SC_CPU_VERSION); - /* The order matters, because CPU_IS_HP_MC68K erroneously returns - true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct - results, however. */ - if (CPU_IS_PA_RISC (cpu)) - { - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; - case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; - default: puts ("hppa-hitachi-hiuxwe2"); break; - } - } - else if (CPU_IS_HP_MC68K (cpu)) - puts ("m68k-hitachi-hiuxwe2"); - else puts ("unknown-hitachi-hiuxwe2"); - exit (0); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - echo unknown-hitachi-hiuxwe2 - exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) - echo hppa1.1-hp-bsd - exit ;; - 9000/8??:4.3bsd:*:*) - echo hppa1.0-hp-bsd - exit ;; - *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) - echo hppa1.0-hp-mpeix - exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) - echo hppa1.1-hp-osf - exit ;; - hp8??:OSF1:*:*) - echo hppa1.0-hp-osf - exit ;; - i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-unknown-osf1mk - else - echo ${UNAME_MACHINE}-unknown-osf1 - fi - exit ;; - parisc*:Lites*:*:*) - echo hppa1.1-hp-lites - exit ;; - C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) - echo c1-convex-bsd - exit ;; - C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) - echo c34-convex-bsd - exit ;; - C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) - echo c38-convex-bsd - exit ;; - C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) - echo c4-convex-bsd - exit ;; - CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ - | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ - -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ - -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*TS:*:*:*) - echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} - exit ;; - sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:BSD/OS:*:*) - echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - esac - exit ;; - i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin - exit ;; - *:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 - exit ;; - i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 - exit ;; - *:Interix*:*) - case ${UNAME_MACHINE} in - x86) - echo i586-pc-interix${UNAME_RELEASE} - exit ;; - authenticamd | genuineintel | EM64T) - echo x86_64-unknown-interix${UNAME_RELEASE} - exit ;; - IA64) - echo ia64-unknown-interix${UNAME_RELEASE} - exit ;; - esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - 8664:Windows_NT:*) - echo x86_64-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? - echo i586-pc-interix - exit ;; - i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin - exit ;; - amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-unknown-cygwin - exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; - prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - *:GNU:*:*) - # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` - exit ;; - *:GNU/*:*:*) - # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu - exit ;; - i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} - exit ;; - arm*:Linux:*:*) - eval $set_cc_for_build - if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep -q __ARM_EABI__ - then - echo ${UNAME_MACHINE}-unknown-linux-gnu - else - echo ${UNAME_MACHINE}-unknown-linux-gnueabi - fi - exit ;; - avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - cris:Linux:*:*) - echo cris-axis-linux-gnu - exit ;; - crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu - exit ;; - frv:Linux:*:*) - echo frv-unknown-linux-gnu - exit ;; - i*86:Linux:*:*) - LIBC=gnu - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit ;; - ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - mips:Linux:*:* | mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef ${UNAME_MACHINE} - #undef ${UNAME_MACHINE}el - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=${UNAME_MACHINE}el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=${UNAME_MACHINE} - #else - CPU= - #endif - #endif -EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - or32:Linux:*:*) - echo or32-unknown-linux-gnu - exit ;; - padre:Linux:*:*) - echo sparc-unknown-linux-gnu - exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu - exit ;; - parisc:Linux:*:* | hppa:Linux:*:*) - # Look for CPU level - case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; - esac - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu - exit ;; - s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux - exit ;; - sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - tile*:Linux:*:*) - echo ${UNAME_MACHINE}-tilera-linux-gnu - exit ;; - vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu - exit ;; - x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu - exit ;; - xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - i*86:DYNIX/ptx:4*:*) - # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. - # earlier versions are messed up and put the nodename in both - # sysname and nodename. - echo i386-sequent-sysv4 - exit ;; - i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, - # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. - echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} - exit ;; - i*86:OS/2:*:*) - # If we were able to find `uname', then EMX Unix compatibility - # is probably installed. - echo ${UNAME_MACHINE}-pc-os2-emx - exit ;; - i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-unknown-stop - exit ;; - i*86:atheos:*:*) - echo ${UNAME_MACHINE}-unknown-atheos - exit ;; - i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable - exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) - echo i386-unknown-lynxos${UNAME_RELEASE} - exit ;; - i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp - exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` - if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} - else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} - fi - exit ;; - i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. - case `/bin/uname -X | grep "^Machine"` in - *486*) UNAME_MACHINE=i486 ;; - *Pentium) UNAME_MACHINE=i586 ;; - *Pent*|*Celeron) UNAME_MACHINE=i686 ;; - esac - echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} - exit ;; - i*86:*:3.2:*) - if test -f /usr/options/cb.name; then - UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then - UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` - (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 - (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ - && UNAME_MACHINE=i586 - (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ - && UNAME_MACHINE=i686 - (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ - && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL - else - echo ${UNAME_MACHINE}-pc-sysv32 - fi - exit ;; - pc:*:*:*) - # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i586. - # Note: whatever this is, it MUST be the same as what config.sub - # prints for the "djgpp" host, or else GDB configury will decide that - # this is a cross-build. - echo i586-pc-msdosdjgpp - exit ;; - Intel:Mach:3*:*) - echo i386-pc-mach3 - exit ;; - paragon:*:*:*) - echo i860-intel-osf1 - exit ;; - i860:*:4.*:*) # i860-SVR4 - if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 - else # Add other i860-SVR4 vendors below as they are discovered. - echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 - fi - exit ;; - mini*:CTIX:SYS*5:*) - # "miniframe" - echo m68010-convergent-sysv - exit ;; - mc68k:UNIX:SYSTEM5:3.51m) - echo m68k-convergent-sysv - exit ;; - M680?0:D-NIX:5.3:*) - echo m68k-diab-dnix - exit ;; - M68*:*:R3V[5678]*:*) - test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; - 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) - OS_REL='' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; - 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; - NCR*:*:4.2:* | MPRAS*:*:4.2:*) - OS_REL='.3' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; - m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos${UNAME_RELEASE} - exit ;; - mc68030:UNIX_System_V:4.*:*) - echo m68k-atari-sysv4 - exit ;; - TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos${UNAME_RELEASE} - exit ;; - rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos${UNAME_RELEASE} - exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) - echo powerpc-unknown-lynxos${UNAME_RELEASE} - exit ;; - SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} - exit ;; - RM*:ReliantUNIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - RM*:SINIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - *:SINIX-*:*:*) - if uname -p 2>/dev/null >/dev/null ; then - UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 - else - echo ns32k-sni-sysv - fi - exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; - *:UNIX_System_V:4*:FTX*) - # From Gerald Hewes . - # How about differentiating between stratus architectures? -djm - echo hppa1.1-stratus-sysv4 - exit ;; - *:*:*:FTX*) - # From seanf@swdc.stratus.com. - echo i860-stratus-sysv4 - exit ;; - i*86:VOS:*:*) - # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos - exit ;; - *:VOS:*:*) - # From Paul.Green@stratus.com. - echo hppa1.1-stratus-vos - exit ;; - mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} - exit ;; - news*:NEWS-OS:6*:*) - echo mips-sony-newsos6 - exit ;; - R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} - else - echo mips-unknown-sysv${UNAME_RELEASE} - fi - exit ;; - BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. - echo powerpc-be-beos - exit ;; - BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. - echo powerpc-apple-beos - exit ;; - BePC:BeOS:*:*) # BeOS running on Intel PC compatible. - echo i586-pc-beos - exit ;; - BePC:Haiku:*:*) # Haiku running on Intel PC compatible. - echo i586-pc-haiku - exit ;; - SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} - exit ;; - SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} - exit ;; - SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} - exit ;; - SX-7:SUPER-UX:*:*) - echo sx7-nec-superux${UNAME_RELEASE} - exit ;; - SX-8:SUPER-UX:*:*) - echo sx8-nec-superux${UNAME_RELEASE} - exit ;; - SX-8R:SUPER-UX:*:*) - echo sx8r-nec-superux${UNAME_RELEASE} - exit ;; - Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - i386) - eval $set_cc_for_build - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - UNAME_PROCESSOR="x86_64" - fi - fi ;; - unknown) UNAME_PROCESSOR=powerpc ;; - esac - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} - exit ;; - *:procnto*:*:* | *:QNX:[0123456789]*:*) - UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then - UNAME_PROCESSOR=i386 - UNAME_MACHINE=pc - fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} - exit ;; - *:QNX:*:4*) - echo i386-pc-qnx - exit ;; - NSE-?:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} - exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} - exit ;; - *:NonStop-UX:*:*) - echo mips-compaq-nonstopux - exit ;; - BS2000:POSIX*:*:*) - echo bs2000-siemens-sysv - exit ;; - DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} - exit ;; - *:Plan9:*:*) - # "uname -m" is not consistent, so use $cputype instead. 386 - # is converted to i386 for consistency with other x86 - # operating systems. - if test "$cputype" = "386"; then - UNAME_MACHINE=i386 - else - UNAME_MACHINE="$cputype" - fi - echo ${UNAME_MACHINE}-unknown-plan9 - exit ;; - *:TOPS-10:*:*) - echo pdp10-unknown-tops10 - exit ;; - *:TENEX:*:*) - echo pdp10-unknown-tenex - exit ;; - KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) - echo pdp10-dec-tops20 - exit ;; - XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) - echo pdp10-xkl-tops20 - exit ;; - *:TOPS-20:*:*) - echo pdp10-unknown-tops20 - exit ;; - *:ITS:*:*) - echo pdp10-unknown-its - exit ;; - SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} - exit ;; - *:DragonFly:*:*) - echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` - exit ;; - *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in - A*) echo alpha-dec-vms ; exit ;; - I*) echo ia64-dec-vms ; exit ;; - V*) echo vax-dec-vms ; exit ;; - esac ;; - *:XENIX:*:SysV) - echo i386-pc-xenix - exit ;; - i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' - exit ;; - i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos - exit ;; - i*86:AROS:*:*) - echo ${UNAME_MACHINE}-pc-aros - exit ;; -esac - -#echo '(No uname command or uname output not recognized.)' 1>&2 -#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 - -eval $set_cc_for_build -cat >$dummy.c < -# include -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, - I don't know.... */ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (__arm) && defined (__acorn) && defined (__unix) - printf ("arm-acorn-riscix\n"); exit (0); -#endif - -#if defined (hp300) && !defined (hpux) - printf ("m68k-hp-bsd\n"); exit (0); -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); - -#endif - -#if defined (vax) -# if !defined (ultrix) -# include -# if defined (BSD) -# if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -# else -# if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# endif -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# else - printf ("vax-dec-ultrix\n"); exit (0); -# endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. - -test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } - -# Convex versions that predate uname can use getsysinfo(1) - -if [ -x /usr/convex/getsysinfo ] -then - case `getsysinfo -f cpu_type` in - c1*) - echo c1-convex-bsd - exit ;; - c2*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - c34*) - echo c34-convex-bsd - exit ;; - c38*) - echo c38-convex-bsd - exit ;; - c4*) - echo c4-convex-bsd - exit ;; - esac -fi - -cat >&2 < in order to provide the needed -information to handle your system. - -config.guess timestamp = $timestamp - -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null` - -hostinfo = `(hostinfo) 2>/dev/null` -/bin/universe = `(/bin/universe) 2>/dev/null` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` -/bin/arch = `(/bin/arch) 2>/dev/null` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` - -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} -EOF - -exit 1 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/volk/config.guess b/volk/config.guess new file mode 120000 index 000000000..405bc3235 --- /dev/null +++ b/volk/config.guess @@ -0,0 +1 @@ +/usr/share/automake-1.11/config.guess \ No newline at end of file diff --git a/volk/config.sub b/volk/config.sub deleted file mode 100755 index 320e30388..000000000 --- a/volk/config.sub +++ /dev/null @@ -1,1739 +0,0 @@ -#! /bin/sh -# Configuration validation subroutine script. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 -# Free Software Foundation, Inc. - -timestamp='2010-09-11' - -# This file is (in principle) common to ALL GNU software. -# The presence of a machine in this file suggests that SOME GNU software -# can handle that machine. It does not imply ALL GNU software can. -# -# This file is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Please send patches to . Submit a context -# diff and a properly formatted GNU ChangeLog entry. -# -# Configuration subroutine to validate and canonicalize a configuration type. -# Supply the specified configuration type as an argument. -# If it is invalid, we print an error message on stderr and exit with code 1. -# Otherwise, we print the canonical config type on stdout and succeed. - -# You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD - -# This file is supposed to be the same for all GNU packages -# and recognize all the CPU types, system types and aliases -# that are meaningful with *any* GNU software. -# Each package is responsible for reporting which valid configurations -# it does not support. The user should be able to distinguish -# a failure to support a valid configuration from a meaningless -# configuration. - -# The goal of this file is to map all the various variations of a given -# machine specification into a single specification in the form: -# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM -# or in some cases, the newer four-part form: -# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM -# It is wrong to echo any other type of specification. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS - $0 [OPTION] ALIAS - -Canonicalize a configuration name. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.sub ($timestamp) - -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free -Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" - exit 1 ;; - - *local*) - # First pass through any local machine types. - echo $1 - exit ;; - - * ) - break ;; - esac -done - -case $# in - 0) echo "$me: missing argument$help" >&2 - exit 1;; - 1) ;; - *) echo "$me: too many arguments$help" >&2 - exit 1;; -esac - -# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). -# Here we must recognize all the valid KERNEL-OS combinations. -maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` -case $maybe_os in - nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ - linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ - knetbsd*-gnu* | netbsd*-gnu* | \ - kopensolaris*-gnu* | \ - storm-chaos* | os2-emx* | rtmk-nova*) - os=-$maybe_os - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` - ;; - *) - basic_machine=`echo $1 | sed 's/-[^-]*$//'` - if [ $basic_machine != $1 ] - then os=`echo $1 | sed 's/.*-/-/'` - else os=; fi - ;; -esac - -### Let's recognize common machines as not being operating systems so -### that things like config.sub decstation-3100 work. We also -### recognize some manufacturers as not being operating systems, so we -### can provide default operating systems below. -case $os in - -sun*os*) - # Prevent following clause from handling this invalid input. - ;; - -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ - -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ - -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray | -microblaze) - os= - basic_machine=$1 - ;; - -bluegene*) - os=-cnk - ;; - -sim | -cisco | -oki | -wec | -winbond) - os= - basic_machine=$1 - ;; - -scout) - ;; - -wrs) - os=-vxworks - basic_machine=$1 - ;; - -chorusos*) - os=-chorusos - basic_machine=$1 - ;; - -chorusrdb) - os=-chorusrdb - basic_machine=$1 - ;; - -hiux*) - os=-hiuxwe2 - ;; - -sco6) - os=-sco5v6 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5) - os=-sco3.2v5 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco4) - os=-sco3.2v4 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2.[4-9]*) - os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2v[4-9]*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5v6*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco*) - os=-sco3.2v2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -udk*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -isc) - os=-isc2.2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -clix*) - basic_machine=clipper-intergraph - ;; - -isc*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -lynx*) - os=-lynxos - ;; - -ptx*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` - ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` - ;; - -psos*) - os=-psos - ;; - -mint | -mint[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; -esac - -# Decode aliases for certain CPU-COMPANY combinations. -case $basic_machine in - # Recognize the basic CPU types without company name. - # Some are omitted here because they have special meanings below. - 1750a | 580 \ - | a29k \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ - | am33_2.0 \ - | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ - | bfin \ - | c4x | clipper \ - | d10v | d30v | dlx | dsp16xx \ - | fido | fr30 | frv \ - | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ - | i370 | i860 | i960 | ia64 \ - | ip2k | iq2000 \ - | lm32 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | mcore | mep | metag \ - | mips | mipsbe | mipseb | mipsel | mipsle \ - | mips16 \ - | mips64 | mips64el \ - | mips64octeon | mips64octeonel \ - | mips64orion | mips64orionel \ - | mips64r5900 | mips64r5900el \ - | mips64vr | mips64vrel \ - | mips64vr4100 | mips64vr4100el \ - | mips64vr4300 | mips64vr4300el \ - | mips64vr5000 | mips64vr5000el \ - | mips64vr5900 | mips64vr5900el \ - | mipsisa32 | mipsisa32el \ - | mipsisa32r2 | mipsisa32r2el \ - | mipsisa64 | mipsisa64el \ - | mipsisa64r2 | mipsisa64r2el \ - | mipsisa64sb1 | mipsisa64sb1el \ - | mipsisa64sr71k | mipsisa64sr71kel \ - | mipstx39 | mipstx39el \ - | mn10200 | mn10300 \ - | moxie \ - | mt \ - | msp430 \ - | nds32 | nds32le | nds32be \ - | nios | nios2 \ - | ns16k | ns32k \ - | or32 \ - | pdp10 | pdp11 | pj | pjl \ - | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ - | pyramid \ - | rx \ - | score \ - | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ - | sh64 | sh64le \ - | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ - | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ - | spu | strongarm \ - | tahoe | thumb | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ - | ubicom32 \ - | v850 | v850e \ - | we32k \ - | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ - | z8k | z80) - basic_machine=$basic_machine-unknown - ;; - c54x) - basic_machine=tic54x-unknown - ;; - c55x) - basic_machine=tic55x-unknown - ;; - c6x) - basic_machine=tic6x-unknown - ;; - m6811 | m68hc11 | m6812 | m68hc12 | picochip) - # Motorola 68HC11/12. - basic_machine=$basic_machine-unknown - os=-none - ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) - ;; - ms1) - basic_machine=mt-unknown - ;; - - # We use `pc' rather than `unknown' - # because (1) that's what they normally are, and - # (2) the word "unknown" tends to confuse beginning users. - i*86 | x86_64) - basic_machine=$basic_machine-pc - ;; - # Object if more than one company name word. - *-*-*) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; - # Recognize the basic CPU types with company name. - 580-* \ - | a29k-* \ - | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ - | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ - | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ - | avr-* | avr32-* \ - | bfin-* | bs2000-* \ - | c[123]* | c30-* | [cjt]90-* | c4x-* \ - | clipper-* | craynv-* | cydra-* \ - | d10v-* | d30v-* | dlx-* \ - | elxsi-* \ - | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ - | h8300-* | h8500-* \ - | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ - | i*86-* | i860-* | i960-* | ia64-* \ - | ip2k-* | iq2000-* \ - | lm32-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \ - | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ - | mips16-* \ - | mips64-* | mips64el-* \ - | mips64octeon-* | mips64octeonel-* \ - | mips64orion-* | mips64orionel-* \ - | mips64r5900-* | mips64r5900el-* \ - | mips64vr-* | mips64vrel-* \ - | mips64vr4100-* | mips64vr4100el-* \ - | mips64vr4300-* | mips64vr4300el-* \ - | mips64vr5000-* | mips64vr5000el-* \ - | mips64vr5900-* | mips64vr5900el-* \ - | mipsisa32-* | mipsisa32el-* \ - | mipsisa32r2-* | mipsisa32r2el-* \ - | mipsisa64-* | mipsisa64el-* \ - | mipsisa64r2-* | mipsisa64r2el-* \ - | mipsisa64sb1-* | mipsisa64sb1el-* \ - | mipsisa64sr71k-* | mipsisa64sr71kel-* \ - | mipstx39-* | mipstx39el-* \ - | mmix-* \ - | mt-* \ - | msp430-* \ - | nds32-* | nds32le-* | nds32be-* \ - | nios-* | nios2-* \ - | none-* | np1-* | ns16k-* | ns32k-* \ - | orion-* \ - | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ - | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ - | pyramid-* \ - | romp-* | rs6000-* | rx-* \ - | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ - | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ - | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ - | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ - | tahoe-* | thumb-* \ - | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ - | tile-* | tilegx-* \ - | tron-* \ - | ubicom32-* \ - | v850-* | v850e-* | vax-* \ - | we32k-* \ - | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ - | xstormy16-* | xtensa*-* \ - | ymp-* \ - | z8k-* | z80-*) - ;; - # Recognize the basic CPU types without company name, with glob match. - xtensa*) - basic_machine=$basic_machine-unknown - ;; - # Recognize the various machine names and aliases which stand - # for a CPU type and a company and sometimes even an OS. - 386bsd) - basic_machine=i386-unknown - os=-bsd - ;; - 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) - basic_machine=m68000-att - ;; - 3b*) - basic_machine=we32k-att - ;; - a29khif) - basic_machine=a29k-amd - os=-udi - ;; - abacus) - basic_machine=abacus-unknown - ;; - adobe68k) - basic_machine=m68010-adobe - os=-scout - ;; - alliant | fx80) - basic_machine=fx80-alliant - ;; - altos | altos3068) - basic_machine=m68k-altos - ;; - am29k) - basic_machine=a29k-none - os=-bsd - ;; - amd64) - basic_machine=x86_64-pc - ;; - amd64-*) - basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - amdahl) - basic_machine=580-amdahl - os=-sysv - ;; - amiga | amiga-*) - basic_machine=m68k-unknown - ;; - amigaos | amigados) - basic_machine=m68k-unknown - os=-amigaos - ;; - amigaunix | amix) - basic_machine=m68k-unknown - os=-sysv4 - ;; - apollo68) - basic_machine=m68k-apollo - os=-sysv - ;; - apollo68bsd) - basic_machine=m68k-apollo - os=-bsd - ;; - aros) - basic_machine=i386-pc - os=-aros - ;; - aux) - basic_machine=m68k-apple - os=-aux - ;; - balance) - basic_machine=ns32k-sequent - os=-dynix - ;; - blackfin) - basic_machine=bfin-unknown - os=-linux - ;; - blackfin-*) - basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - bluegene*) - basic_machine=powerpc-ibm - os=-cnk - ;; - c54x-*) - basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - c55x-*) - basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - c6x-*) - basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - c90) - basic_machine=c90-cray - os=-unicos - ;; - cegcc) - basic_machine=arm-unknown - os=-cegcc - ;; - convex-c1) - basic_machine=c1-convex - os=-bsd - ;; - convex-c2) - basic_machine=c2-convex - os=-bsd - ;; - convex-c32) - basic_machine=c32-convex - os=-bsd - ;; - convex-c34) - basic_machine=c34-convex - os=-bsd - ;; - convex-c38) - basic_machine=c38-convex - os=-bsd - ;; - cray | j90) - basic_machine=j90-cray - os=-unicos - ;; - craynv) - basic_machine=craynv-cray - os=-unicosmp - ;; - cr16) - basic_machine=cr16-unknown - os=-elf - ;; - crds | unos) - basic_machine=m68k-crds - ;; - crisv32 | crisv32-* | etraxfs*) - basic_machine=crisv32-axis - ;; - cris | cris-* | etrax*) - basic_machine=cris-axis - ;; - crx) - basic_machine=crx-unknown - os=-elf - ;; - da30 | da30-*) - basic_machine=m68k-da30 - ;; - decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) - basic_machine=mips-dec - ;; - decsystem10* | dec10*) - basic_machine=pdp10-dec - os=-tops10 - ;; - decsystem20* | dec20*) - basic_machine=pdp10-dec - os=-tops20 - ;; - delta | 3300 | motorola-3300 | motorola-delta \ - | 3300-motorola | delta-motorola) - basic_machine=m68k-motorola - ;; - delta88) - basic_machine=m88k-motorola - os=-sysv3 - ;; - dicos) - basic_machine=i686-pc - os=-dicos - ;; - djgpp) - basic_machine=i586-pc - os=-msdosdjgpp - ;; - dpx20 | dpx20-*) - basic_machine=rs6000-bull - os=-bosx - ;; - dpx2* | dpx2*-bull) - basic_machine=m68k-bull - os=-sysv3 - ;; - ebmon29k) - basic_machine=a29k-amd - os=-ebmon - ;; - elxsi) - basic_machine=elxsi-elxsi - os=-bsd - ;; - encore | umax | mmax) - basic_machine=ns32k-encore - ;; - es1800 | OSE68k | ose68k | ose | OSE) - basic_machine=m68k-ericsson - os=-ose - ;; - fx2800) - basic_machine=i860-alliant - ;; - genix) - basic_machine=ns32k-ns - ;; - gmicro) - basic_machine=tron-gmicro - os=-sysv - ;; - go32) - basic_machine=i386-pc - os=-go32 - ;; - h3050r* | hiux*) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - h8300hms) - basic_machine=h8300-hitachi - os=-hms - ;; - h8300xray) - basic_machine=h8300-hitachi - os=-xray - ;; - h8500hms) - basic_machine=h8500-hitachi - os=-hms - ;; - harris) - basic_machine=m88k-harris - os=-sysv3 - ;; - hp300-*) - basic_machine=m68k-hp - ;; - hp300bsd) - basic_machine=m68k-hp - os=-bsd - ;; - hp300hpux) - basic_machine=m68k-hp - os=-hpux - ;; - hp3k9[0-9][0-9] | hp9[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hp9k2[0-9][0-9] | hp9k31[0-9]) - basic_machine=m68000-hp - ;; - hp9k3[2-9][0-9]) - basic_machine=m68k-hp - ;; - hp9k6[0-9][0-9] | hp6[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hp9k7[0-79][0-9] | hp7[0-79][0-9]) - basic_machine=hppa1.1-hp - ;; - hp9k78[0-9] | hp78[0-9]) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][13679] | hp8[0-9][13679]) - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][0-9] | hp8[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hppa-next) - os=-nextstep3 - ;; - hppaosf) - basic_machine=hppa1.1-hp - os=-osf - ;; - hppro) - basic_machine=hppa1.1-hp - os=-proelf - ;; - i370-ibm* | ibm*) - basic_machine=i370-ibm - ;; -# I'm not sure what "Sysv32" means. Should this be sysv3.2? - i*86v32) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv32 - ;; - i*86v4*) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv4 - ;; - i*86v) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv - ;; - i*86sol2) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-solaris2 - ;; - i386mach) - basic_machine=i386-mach - os=-mach - ;; - i386-vsta | vsta) - basic_machine=i386-unknown - os=-vsta - ;; - iris | iris4d) - basic_machine=mips-sgi - case $os in - -irix*) - ;; - *) - os=-irix4 - ;; - esac - ;; - isi68 | isi) - basic_machine=m68k-isi - os=-sysv - ;; - m68knommu) - basic_machine=m68k-unknown - os=-linux - ;; - m68knommu-*) - basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - m88k-omron*) - basic_machine=m88k-omron - ;; - magnum | m3230) - basic_machine=mips-mips - os=-sysv - ;; - merlin) - basic_machine=ns32k-utek - os=-sysv - ;; - microblaze) - basic_machine=microblaze-xilinx - ;; - mingw32) - basic_machine=i386-pc - os=-mingw32 - ;; - mingw32ce) - basic_machine=arm-unknown - os=-mingw32ce - ;; - miniframe) - basic_machine=m68000-convergent - ;; - *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; - mips3*-*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` - ;; - mips3*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown - ;; - monitor) - basic_machine=m68k-rom68k - os=-coff - ;; - morphos) - basic_machine=powerpc-unknown - os=-morphos - ;; - msdos) - basic_machine=i386-pc - os=-msdos - ;; - ms1-*) - basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` - ;; - mvs) - basic_machine=i370-ibm - os=-mvs - ;; - ncr3000) - basic_machine=i486-ncr - os=-sysv4 - ;; - netbsd386) - basic_machine=i386-unknown - os=-netbsd - ;; - netwinder) - basic_machine=armv4l-rebel - os=-linux - ;; - news | news700 | news800 | news900) - basic_machine=m68k-sony - os=-newsos - ;; - news1000) - basic_machine=m68030-sony - os=-newsos - ;; - news-3600 | risc-news) - basic_machine=mips-sony - os=-newsos - ;; - necv70) - basic_machine=v70-nec - os=-sysv - ;; - next | m*-next ) - basic_machine=m68k-next - case $os in - -nextstep* ) - ;; - -ns2*) - os=-nextstep2 - ;; - *) - os=-nextstep3 - ;; - esac - ;; - nh3000) - basic_machine=m68k-harris - os=-cxux - ;; - nh[45]000) - basic_machine=m88k-harris - os=-cxux - ;; - nindy960) - basic_machine=i960-intel - os=-nindy - ;; - mon960) - basic_machine=i960-intel - os=-mon960 - ;; - nonstopux) - basic_machine=mips-compaq - os=-nonstopux - ;; - np1) - basic_machine=np1-gould - ;; - neo-tandem) - basic_machine=neo-tandem - ;; - nse-tandem) - basic_machine=nse-tandem - ;; - nsr-tandem) - basic_machine=nsr-tandem - ;; - op50n-* | op60c-*) - basic_machine=hppa1.1-oki - os=-proelf - ;; - openrisc | openrisc-*) - basic_machine=or32-unknown - ;; - os400) - basic_machine=powerpc-ibm - os=-os400 - ;; - OSE68000 | ose68000) - basic_machine=m68000-ericsson - os=-ose - ;; - os68k) - basic_machine=m68k-none - os=-os68k - ;; - pa-hitachi) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - paragon) - basic_machine=i860-intel - os=-osf - ;; - parisc) - basic_machine=hppa-unknown - os=-linux - ;; - parisc-*) - basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` - os=-linux - ;; - pbd) - basic_machine=sparc-tti - ;; - pbb) - basic_machine=m68k-tti - ;; - pc532 | pc532-*) - basic_machine=ns32k-pc532 - ;; - pc98) - basic_machine=i386-pc - ;; - pc98-*) - basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium | p5 | k5 | k6 | nexgen | viac3) - basic_machine=i586-pc - ;; - pentiumpro | p6 | 6x86 | athlon | athlon_*) - basic_machine=i686-pc - ;; - pentiumii | pentium2 | pentiumiii | pentium3) - basic_machine=i686-pc - ;; - pentium4) - basic_machine=i786-pc - ;; - pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium4-*) - basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pn) - basic_machine=pn-gould - ;; - power) basic_machine=power-ibm - ;; - ppc) basic_machine=powerpc-unknown - ;; - ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppcle | powerpclittle | ppc-le | powerpc-little) - basic_machine=powerpcle-unknown - ;; - ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64) basic_machine=powerpc64-unknown - ;; - ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64le | powerpc64little | ppc64-le | powerpc64-little) - basic_machine=powerpc64le-unknown - ;; - ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ps2) - basic_machine=i386-ibm - ;; - pw32) - basic_machine=i586-unknown - os=-pw32 - ;; - rdos) - basic_machine=i386-pc - os=-rdos - ;; - rom68k) - basic_machine=m68k-rom68k - os=-coff - ;; - rm[46]00) - basic_machine=mips-siemens - ;; - rtpc | rtpc-*) - basic_machine=romp-ibm - ;; - s390 | s390-*) - basic_machine=s390-ibm - ;; - s390x | s390x-*) - basic_machine=s390x-ibm - ;; - sa29200) - basic_machine=a29k-amd - os=-udi - ;; - sb1) - basic_machine=mipsisa64sb1-unknown - ;; - sb1el) - basic_machine=mipsisa64sb1el-unknown - ;; - sde) - basic_machine=mipsisa32-sde - os=-elf - ;; - sei) - basic_machine=mips-sei - os=-seiux - ;; - sequent) - basic_machine=i386-sequent - ;; - sh) - basic_machine=sh-hitachi - os=-hms - ;; - sh5el) - basic_machine=sh5le-unknown - ;; - sh64) - basic_machine=sh64-unknown - ;; - sparclite-wrs | simso-wrs) - basic_machine=sparclite-wrs - os=-vxworks - ;; - sps7) - basic_machine=m68k-bull - os=-sysv2 - ;; - spur) - basic_machine=spur-unknown - ;; - st2000) - basic_machine=m68k-tandem - ;; - stratus) - basic_machine=i860-stratus - os=-sysv4 - ;; - sun2) - basic_machine=m68000-sun - ;; - sun2os3) - basic_machine=m68000-sun - os=-sunos3 - ;; - sun2os4) - basic_machine=m68000-sun - os=-sunos4 - ;; - sun3os3) - basic_machine=m68k-sun - os=-sunos3 - ;; - sun3os4) - basic_machine=m68k-sun - os=-sunos4 - ;; - sun4os3) - basic_machine=sparc-sun - os=-sunos3 - ;; - sun4os4) - basic_machine=sparc-sun - os=-sunos4 - ;; - sun4sol2) - basic_machine=sparc-sun - os=-solaris2 - ;; - sun3 | sun3-*) - basic_machine=m68k-sun - ;; - sun4) - basic_machine=sparc-sun - ;; - sun386 | sun386i | roadrunner) - basic_machine=i386-sun - ;; - sv1) - basic_machine=sv1-cray - os=-unicos - ;; - symmetry) - basic_machine=i386-sequent - os=-dynix - ;; - t3e) - basic_machine=alphaev5-cray - os=-unicos - ;; - t90) - basic_machine=t90-cray - os=-unicos - ;; - # This must be matched before tile*. - tilegx*) - basic_machine=tilegx-unknown - os=-linux-gnu - ;; - tile*) - basic_machine=tile-unknown - os=-linux-gnu - ;; - tx39) - basic_machine=mipstx39-unknown - ;; - tx39el) - basic_machine=mipstx39el-unknown - ;; - toad1) - basic_machine=pdp10-xkl - os=-tops20 - ;; - tower | tower-32) - basic_machine=m68k-ncr - ;; - tpf) - basic_machine=s390x-ibm - os=-tpf - ;; - udi29k) - basic_machine=a29k-amd - os=-udi - ;; - ultra3) - basic_machine=a29k-nyu - os=-sym1 - ;; - v810 | necv810) - basic_machine=v810-nec - os=-none - ;; - vaxv) - basic_machine=vax-dec - os=-sysv - ;; - vms) - basic_machine=vax-dec - os=-vms - ;; - vpp*|vx|vx-*) - basic_machine=f301-fujitsu - ;; - vxworks960) - basic_machine=i960-wrs - os=-vxworks - ;; - vxworks68) - basic_machine=m68k-wrs - os=-vxworks - ;; - vxworks29k) - basic_machine=a29k-wrs - os=-vxworks - ;; - w65*) - basic_machine=w65-wdc - os=-none - ;; - w89k-*) - basic_machine=hppa1.1-winbond - os=-proelf - ;; - xbox) - basic_machine=i686-pc - os=-mingw32 - ;; - xps | xps100) - basic_machine=xps100-honeywell - ;; - ymp) - basic_machine=ymp-cray - os=-unicos - ;; - z8k-*-coff) - basic_machine=z8k-unknown - os=-sim - ;; - z80-*-coff) - basic_machine=z80-unknown - os=-sim - ;; - none) - basic_machine=none-none - os=-none - ;; - -# Here we handle the default manufacturer of certain CPU types. It is in -# some cases the only manufacturer, in others, it is the most popular. - w89k) - basic_machine=hppa1.1-winbond - ;; - op50n) - basic_machine=hppa1.1-oki - ;; - op60c) - basic_machine=hppa1.1-oki - ;; - romp) - basic_machine=romp-ibm - ;; - mmix) - basic_machine=mmix-knuth - ;; - rs6000) - basic_machine=rs6000-ibm - ;; - vax) - basic_machine=vax-dec - ;; - pdp10) - # there are many clones, so DEC is not a safe bet - basic_machine=pdp10-unknown - ;; - pdp11) - basic_machine=pdp11-dec - ;; - we32k) - basic_machine=we32k-att - ;; - sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) - basic_machine=sh-unknown - ;; - sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) - basic_machine=sparc-sun - ;; - cydra) - basic_machine=cydra-cydrome - ;; - orion) - basic_machine=orion-highlevel - ;; - orion105) - basic_machine=clipper-highlevel - ;; - mac | mpw | mac-mpw) - basic_machine=m68k-apple - ;; - pmac | pmac-mpw) - basic_machine=powerpc-apple - ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. - ;; - *) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; -esac - -# Here we canonicalize certain aliases for manufacturers. -case $basic_machine in - *-digital*) - basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` - ;; - *-commodore*) - basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` - ;; - *) - ;; -esac - -# Decode manufacturer-specific aliases for certain operating systems. - -if [ x"$os" != x"" ] -then -case $os in - # First match some system type aliases - # that might get confused with valid system types. - # -solaris* is a basic system type, with this one exception. - -auroraux) - os=-auroraux - ;; - -solaris1 | -solaris1.*) - os=`echo $os | sed -e 's|solaris1|sunos4|'` - ;; - -solaris) - os=-solaris2 - ;; - -svr4*) - os=-sysv4 - ;; - -unixware*) - os=-sysv4.2uw - ;; - -gnu/linux*) - os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` - ;; - # First accept the basic system types. - # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. - # -sysv* is not here because it comes later, after sysvr4. - -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ - | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ - | -sym* | -kopensolaris* \ - | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* | -aros* \ - | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ - | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -openbsd* | -solidbsd* \ - | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ - | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ - | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ - | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* | -cegcc* \ - | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -mingw32* | -linux-gnu* | -linux-android* \ - | -linux-newlib* | -linux-uclibc* \ - | -uxpv* | -beos* | -mpeix* | -udk* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ - | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ - | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ - | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ - | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) - # Remember, each alternative MUST END IN *, to match a version number. - ;; - -qnx*) - case $basic_machine in - x86-* | i*86-*) - ;; - *) - os=-nto$os - ;; - esac - ;; - -nto-qnx*) - ;; - -nto*) - os=`echo $os | sed -e 's|nto|nto-qnx|'` - ;; - -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ - | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) - ;; - -mac*) - os=`echo $os | sed -e 's|mac|macos|'` - ;; - -linux-dietlibc) - os=-linux-dietlibc - ;; - -linux*) - os=`echo $os | sed -e 's|linux|linux-gnu|'` - ;; - -sunos5*) - os=`echo $os | sed -e 's|sunos5|solaris2|'` - ;; - -sunos6*) - os=`echo $os | sed -e 's|sunos6|solaris3|'` - ;; - -opened*) - os=-openedition - ;; - -os400*) - os=-os400 - ;; - -wince*) - os=-wince - ;; - -osfrose*) - os=-osfrose - ;; - -osf*) - os=-osf - ;; - -utek*) - os=-bsd - ;; - -dynix*) - os=-bsd - ;; - -acis*) - os=-aos - ;; - -atheos*) - os=-atheos - ;; - -syllable*) - os=-syllable - ;; - -386bsd) - os=-bsd - ;; - -ctix* | -uts*) - os=-sysv - ;; - -nova*) - os=-rtmk-nova - ;; - -ns2 ) - os=-nextstep2 - ;; - -nsk*) - os=-nsk - ;; - # Preserve the version number of sinix5. - -sinix5.*) - os=`echo $os | sed -e 's|sinix|sysv|'` - ;; - -sinix*) - os=-sysv4 - ;; - -tpf*) - os=-tpf - ;; - -triton*) - os=-sysv3 - ;; - -oss*) - os=-sysv3 - ;; - -svr4) - os=-sysv4 - ;; - -svr3) - os=-sysv3 - ;; - -sysvr4) - os=-sysv4 - ;; - # This must come after -sysvr4. - -sysv*) - ;; - -ose*) - os=-ose - ;; - -es1800*) - os=-ose - ;; - -xenix) - os=-xenix - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - os=-mint - ;; - -aros*) - os=-aros - ;; - -kaos*) - os=-kaos - ;; - -zvmoe) - os=-zvmoe - ;; - -dicos*) - os=-dicos - ;; - -nacl*) - ;; - -none) - ;; - *) - # Get rid of the `-' at the beginning of $os. - os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 - exit 1 - ;; -esac -else - -# Here we handle the default operating systems that come with various machines. -# The value should be what the vendor currently ships out the door with their -# machine or put another way, the most popular os provided with the machine. - -# Note that if you're going to try to match "-MANUFACTURER" here (say, -# "-sun"), then you have to tell the case statement up towards the top -# that MANUFACTURER isn't an operating system. Otherwise, code above -# will signal an error saying that MANUFACTURER isn't an operating -# system, and we'll never get to this point. - -case $basic_machine in - score-*) - os=-elf - ;; - spu-*) - os=-elf - ;; - *-acorn) - os=-riscix1.2 - ;; - arm*-rebel) - os=-linux - ;; - arm*-semi) - os=-aout - ;; - c4x-* | tic4x-*) - os=-coff - ;; - tic54x-*) - os=-coff - ;; - tic55x-*) - os=-coff - ;; - tic6x-*) - os=-coff - ;; - # This must come before the *-dec entry. - pdp10-*) - os=-tops20 - ;; - pdp11-*) - os=-none - ;; - *-dec | vax-*) - os=-ultrix4.2 - ;; - m68*-apollo) - os=-domain - ;; - i386-sun) - os=-sunos4.0.2 - ;; - m68000-sun) - os=-sunos3 - # This also exists in the configure program, but was not the - # default. - # os=-sunos4 - ;; - m68*-cisco) - os=-aout - ;; - mep-*) - os=-elf - ;; - mips*-cisco) - os=-elf - ;; - mips*-*) - os=-elf - ;; - or32-*) - os=-coff - ;; - *-tti) # must be before sparc entry or we get the wrong os. - os=-sysv3 - ;; - sparc-* | *-sun) - os=-sunos4.1.1 - ;; - *-be) - os=-beos - ;; - *-haiku) - os=-haiku - ;; - *-ibm) - os=-aix - ;; - *-knuth) - os=-mmixware - ;; - *-wec) - os=-proelf - ;; - *-winbond) - os=-proelf - ;; - *-oki) - os=-proelf - ;; - *-hp) - os=-hpux - ;; - *-hitachi) - os=-hiux - ;; - i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) - os=-sysv - ;; - *-cbm) - os=-amigaos - ;; - *-dg) - os=-dgux - ;; - *-dolphin) - os=-sysv3 - ;; - m68k-ccur) - os=-rtu - ;; - m88k-omron*) - os=-luna - ;; - *-next ) - os=-nextstep - ;; - *-sequent) - os=-ptx - ;; - *-crds) - os=-unos - ;; - *-ns) - os=-genix - ;; - i370-*) - os=-mvs - ;; - *-next) - os=-nextstep3 - ;; - *-gould) - os=-sysv - ;; - *-highlevel) - os=-bsd - ;; - *-encore) - os=-bsd - ;; - *-sgi) - os=-irix - ;; - *-siemens) - os=-sysv4 - ;; - *-masscomp) - os=-rtu - ;; - f30[01]-fujitsu | f700-fujitsu) - os=-uxpv - ;; - *-rom68k) - os=-coff - ;; - *-*bug) - os=-coff - ;; - *-apple) - os=-macos - ;; - *-atari*) - os=-mint - ;; - *) - os=-none - ;; -esac -fi - -# Here we handle the case where we know the os, and the CPU type, but not the -# manufacturer. We pick the logical manufacturer. -vendor=unknown -case $basic_machine in - *-unknown) - case $os in - -riscix*) - vendor=acorn - ;; - -sunos*) - vendor=sun - ;; - -cnk*|-aix*) - vendor=ibm - ;; - -beos*) - vendor=be - ;; - -hpux*) - vendor=hp - ;; - -mpeix*) - vendor=hp - ;; - -hiux*) - vendor=hitachi - ;; - -unos*) - vendor=crds - ;; - -dgux*) - vendor=dg - ;; - -luna*) - vendor=omron - ;; - -genix*) - vendor=ns - ;; - -mvs* | -opened*) - vendor=ibm - ;; - -os400*) - vendor=ibm - ;; - -ptx*) - vendor=sequent - ;; - -tpf*) - vendor=ibm - ;; - -vxsim* | -vxworks* | -windiss*) - vendor=wrs - ;; - -aux*) - vendor=apple - ;; - -hms*) - vendor=hitachi - ;; - -mpw* | -macos*) - vendor=apple - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - vendor=atari - ;; - -vos*) - vendor=stratus - ;; - esac - basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` - ;; -esac - -echo $basic_machine$os -exit - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/volk/config.sub b/volk/config.sub new file mode 120000 index 000000000..4d47fbcbc --- /dev/null +++ b/volk/config.sub @@ -0,0 +1 @@ +/usr/share/automake-1.11/config.sub \ No newline at end of file diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 99276ab87..aef1d7ba8 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -41,93 +41,93 @@ volkinclude_HEADERS = \ volk.h \ volk_cpu.h \ volk_environment_init.h \ - volk_16s_add_quad_aligned16.h \ - volk_16s_branch_4_state_8_aligned16.h \ - volk_16sc_deinterleave_16s_aligned16.h \ - volk_16sc_deinterleave_32f_aligned16.h \ - volk_16sc_deinterleave_real_16s_aligned16.h \ - volk_16sc_deinterleave_real_32f_aligned16.h \ - volk_16sc_deinterleave_real_8s_aligned16.h \ - volk_16sc_magnitude_16s_aligned16.h \ - volk_16sc_magnitude_32f_aligned16.h \ - volk_16s_convert_32f_aligned16.h \ - volk_16s_convert_32f_unaligned16.h \ - volk_16s_convert_8s_aligned16.h \ - volk_16s_convert_8s_unaligned16.h \ - volk_16s_max_star_aligned16.h \ - volk_16s_max_star_horizontal_aligned16.h \ - volk_16s_permute_and_scalar_add_aligned16.h \ - volk_16s_quad_max_star_aligned16.h \ - volk_16u_byteswap_aligned16.h \ - volk_32f_accumulator_aligned16.h \ - volk_32f_add_aligned16.h \ - volk_32fc_32f_multiply_aligned16.h \ - volk_32fc_32f_power_32fc_aligned16.h \ - volk_32f_calc_spectral_noise_floor_aligned16.h \ - volk_32fc_atan2_32f_aligned16.h \ - volk_32fc_conjugate_dot_prod_aligned16.h \ - volk_32fc_deinterleave_32f_aligned16.h \ - volk_32fc_deinterleave_64f_aligned16.h \ - volk_32fc_deinterleave_real_16s_aligned16.h \ - volk_32fc_deinterleave_real_32f_aligned16.h \ - volk_32fc_deinterleave_real_64f_aligned16.h \ - volk_32fc_dot_prod_aligned16.h \ - volk_32fc_index_max_aligned16.h \ - volk_32fc_magnitude_16s_aligned16.h \ - volk_32fc_magnitude_32f_aligned16.h \ - volk_32fc_multiply_aligned16.h \ - volk_32f_convert_16s_aligned16.h \ - volk_32f_convert_16s_unaligned16.h \ - volk_32f_convert_32s_aligned16.h \ - volk_32f_convert_32s_unaligned16.h \ - volk_32f_convert_64f_aligned16.h \ - volk_32f_convert_64f_unaligned16.h \ - volk_32f_convert_8s_aligned16.h \ - volk_32f_convert_8s_unaligned16.h \ - volk_32fc_power_spectral_density_32f_aligned16.h \ - volk_32fc_power_spectrum_32f_aligned16.h \ - volk_32fc_square_dist_aligned16.h \ - volk_32fc_square_dist_scalar_mult_aligned16.h \ - volk_32f_divide_aligned16.h \ - volk_32f_dot_prod_aligned16.h \ - volk_32f_dot_prod_unaligned16.h \ - volk_32f_fm_detect_aligned16.h \ - volk_32f_index_max_aligned16.h \ - volk_32f_interleave_16sc_aligned16.h \ - volk_32f_interleave_32fc_aligned16.h \ - volk_32f_max_aligned16.h \ - volk_32f_min_aligned16.h \ - volk_32f_multiply_aligned16.h \ - volk_32f_normalize_aligned16.h \ - volk_32f_power_aligned16.h \ - volk_32f_sqrt_aligned16.h \ - volk_32f_stddev_aligned16.h \ - volk_32f_stddev_and_mean_aligned16.h \ - volk_32f_subtract_aligned16.h \ - volk_32f_sum_of_poly_aligned16.h \ - volk_32s_and_aligned16.h \ - volk_32s_convert_32f_aligned16.h \ - volk_32s_convert_32f_unaligned16.h \ - volk_32s_or_aligned16.h \ - volk_32u_byteswap_aligned16.h \ - volk_32u_popcnt_aligned16.h \ - volk_64f_convert_32f_aligned16.h \ - volk_64f_convert_32f_unaligned16.h \ - volk_64f_max_aligned16.h \ - volk_64f_min_aligned16.h \ - volk_64u_byteswap_aligned16.h \ - volk_64u_popcnt_aligned16.h \ - volk_8sc_deinterleave_16s_aligned16.h \ - volk_8sc_deinterleave_32f_aligned16.h \ - volk_8sc_deinterleave_real_16s_aligned16.h \ - volk_8sc_deinterleave_real_32f_aligned16.h \ - volk_8sc_deinterleave_real_8s_aligned16.h \ - volk_8sc_multiply_conjugate_16sc_aligned16.h \ - volk_8sc_multiply_conjugate_32fc_aligned16.h \ - volk_8s_convert_16s_aligned16.h \ - volk_8s_convert_16s_unaligned16.h \ - volk_8s_convert_32f_aligned16.h \ - volk_8s_convert_32f_unaligned16.h + volk_16s_add_quad_a16.h \ + volk_16s_branch_4_state_8_a16.h \ + volk_16sc_deinterleave_16s_16s_a16.h \ + volk_16sc_s32f_deinterleave_32f_32f_a16.h \ + volk_16sc_deinterleave_real_16s_a16.h \ + volk_16sc_s32f_deinterleave_real_32f_a16.h \ + volk_16sc_deinterleave_real_8s_a16.h \ + volk_16sc_magnitude_16s_a16.h \ + volk_16sc_s32f_magnitude_32f_a16.h \ + volk_16s_s32f_convert_32f_a16.h \ + volk_16s_s32f_convert_32f_ua16.h \ + volk_16s_convert_8s_a16.h \ + volk_16s_convert_8s_ua16.h \ + volk_16s_max_star_16s_a16.h \ + volk_16s_max_star_horizontal_16s_a16.h \ + volk_16s_permute_and_scalar_add_a16.h \ + volk_16s_quad_max_star_16s_a16.h \ + volk_16u_byteswap_a16.h \ + volk_32f_accumulator_s32f_a16.h \ + volk_32f_32f_add_32f_a16.h \ + volk_32fc_32f_multiply_32fc_a16.h \ + volk_32fc_32f_power_32fc_a16.h \ + volk_32f_calc_spectral_noise_floor_a16.h \ + volk_32fc_s32f_atan2_32f_a16.h \ + volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_deinterleave_32f_32f_a16.h \ + volk_32fc_deinterleave_64f_64f_a16.h \ + volk_32fc_deinterleave_real_16s_a16.h \ + volk_32fc_deinterleave_real_32f_a16.h \ + volk_32fc_deinterleave_real_64f_a16.h \ + volk_32fc_32fc_dot_prod_32fc_a16.h \ + volk_32fc_index_max_16u_a16.h \ + volk_32fc_s32f_magnitude_16s_a16.h \ + volk_32fc_magnitude_32f_a16.h \ + volk_32fc_32fc_multiply_32fc_a16.h \ + volk_32f_s32f_convert_16s_a16.h \ + volk_32f_s32f_convert_16s_ua16.h \ + volk_32f_s32f_convert_32s_a16.h \ + volk_32f_s32f_convert_32s_ua16.h \ + volk_32f_convert_64f_a16.h \ + volk_32f_convert_64f_ua16.h \ + volk_32f_s32f_convert_8s_a16.h \ + volk_32f_s32f_convert_8s_ua16.h \ + volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \ + volk_32fc_s32f_power_spectrum_32f_a16.h \ + volk_32fc_32fc_square_dist_32f_a16.h \ + volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \ + volk_32f_32f_divide_32f_a16.h \ + volk_32f_32f_dot_prod_32f_a16.h \ + volk_32f_32f_dot_prod_32f_ua16.h \ + volk_32f_s32f_32f_fm_detect_32f_a16.h \ + volk_32f_index_max_16u_a16.h \ + volk_32f_32f_s32f_interleave_16sc_a16.h \ + volk_32f_32f_interleave_32fc_a16.h \ + volk_32f_32f_max_32f_a16.h \ + volk_32f_32f_min_32f_a16.h \ + volk_32f_32f_multiply_32f_a16.h \ + volk_32f_s32f_normalize_a16.h \ + volk_32f_s32f_power_32f_a16.h \ + volk_32f_sqrt_32f_a16.h \ + volk_32f_s32f_stddev_32f_a16.h \ + volk_32f_stddev_and_mean_32f_32f_a16.h \ + volk_32f_32f_subtract_32f_a16.h \ + volk_32f_32f_32f_sum_of_poly_32f_a16.h \ + volk_32s_32s_and_32s_a16.h \ + volk_32s_s32f_convert_32f_a16.h \ + volk_32s_s32f_convert_32f_ua16.h \ + volk_32s_32s_or_32s_a16.h \ + volk_32u_byteswap_a16.h \ + volk_32u_popcnt_a16.h \ + volk_64f_convert_32f_a16.h \ + volk_64f_convert_32f_ua16.h \ + volk_64f_64f_max_64f_a16.h \ + volk_64f_64f_min_64f_a16.h \ + volk_64u_byteswap_a16.h \ + volk_64u_popcnt_a16.h \ + volk_8sc_deinterleave_16s_16s_a16.h \ + volk_8sc_s32f_deinterleave_32f_32f_a16.h \ + volk_8sc_deinterleave_real_16s_a16.h \ + volk_8sc_s32f_deinterleave_real_32f_a16.h \ + volk_8sc_deinterleave_real_8s_a16.h \ + volk_8sc_8sc_multiply_conjugate_16sc_a16.h \ + volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \ + volk_8s_convert_16s_a16.h \ + volk_8s_convert_16s_ua16.h \ + volk_8s_s32f_convert_32f_a16.h \ + volk_8s_s32f_convert_32f_ua16.h VOLK_MKTABLES_SOURCES = \ $(top_srcdir)/lib/volk_rank_archs.c \ diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f2432d7a4..f708ba7d0 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -24,8 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = tempstring + " }\n" tempstring = tempstring + " return 0;\n" tempstring = tempstring + "}\n" - - + for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h new file mode 100644 index 000000000..67d0c55a3 --- /dev/null +++ b/volk/include/volk/volk_16s_add_quad_a16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16s_add_quad_a16_H +#define INCLUDED_volk_16s_add_quad_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16s_add_quad_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16s_add_quad_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t" + ".%=volk_16s_add_quad_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16s_add_quad_a16_H*/ diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h deleted file mode 100644 index 63042bef1..000000000 --- a/volk/include/volk/volk_16s_add_quad_aligned16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H -#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 -#include -#include - -static inline void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t" - ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC - -static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h new file mode 100644 index 000000000..4c1af8729 --- /dev/null +++ b/volk/include/volk/volk_16s_branch_4_state_8_a16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H +#define INCLUDED_volk_16s_branch_4_state_8_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16s_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h deleted file mode 100644 index fb9d7cb87..000000000 --- a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H -#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H - - -#include -#include - - - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_branch_4_state_8_aligned16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#if LV_HAVE_GENERIC -static inline void volk_16s_branch_4_state_8_aligned16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h deleted file mode 100644 index 126ce1528..000000000 --- a/volk/include/volk/volk_16s_convert_32f_aligned16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h deleted file mode 100644 index d6212fba5..000000000 --- a/volk/include/volk/volk_16s_convert_32f_unaligned16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h new file mode 100644 index 000000000..13db435de --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_a16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16s_convert_8s_a16_H +#define INCLUDED_volk_16s_convert_8s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h deleted file mode 100644 index 64c368688..000000000 --- a/volk/include/volk/volk_16s_convert_8s_aligned16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h new file mode 100644 index 000000000..9941118ae --- /dev/null +++ b/volk/include/volk/volk_16s_convert_8s_ua16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_16s_convert_8s_ua16_H +#define INCLUDED_volk_16s_convert_8s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h deleted file mode 100644 index ca925de86..000000000 --- a/volk/include/volk/volk_16s_convert_8s_unaligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H -#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h new file mode 100644 index 000000000..b2ec90552 --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_16s_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16s_max_star_16s_a16_H +#define INCLUDED_volk_16s_max_star_16s_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h deleted file mode 100644 index ba4e979ec..000000000 --- a/volk/include/volk/volk_16s_max_star_aligned16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H -#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h new file mode 100644 index 000000000..68994593b --- /dev/null +++ b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H +#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h deleted file mode 100644 index 82d011677..000000000 --- a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H -#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h new file mode 100644 index 000000000..2e7586b57 --- /dev/null +++ b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16s_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h deleted file mode 100644 index 452d05c4f..000000000 --- a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H -#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H - - -#include -#include - - - - -#if LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h new file mode 100644 index 000000000..3e89ff963 --- /dev/null +++ b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H +#define INCLUDED_volk_16s_quad_max_star_16s_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t" + + "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h deleted file mode 100644 index 1004c4d23..000000000 --- a/volk/include/volk/volk_16s_quad_max_star_aligned16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H -#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 - -#include - -static inline void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16s_quad_max_star_aligned16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t" - - "volk_16s_quad_max_star_aligned16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..8f9b44478 --- /dev/null +++ b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H +#define INCLUDED_volk_16s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..ad52aea1a --- /dev/null +++ b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h new file mode 100644 index 000000000..8e5da24ec --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H +#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h deleted file mode 100644 index cf94a3f38..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_16s_aligned16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_16s_aligned16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_16s_aligned16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h deleted file mode 100644 index 50b8b62d5..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16sc_deinterleave_32f_aligned16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_deinterleave_32f_aligned16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_deinterleave_32f_aligned16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..068c1350c --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index b594c85b8..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 3e7be1e64..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h new file mode 100644 index 000000000..afa21ebc4 --- /dev/null +++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H +#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index 2dd85a422..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_real_8s_aligned16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_real_8s_aligned16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_real_8s_aligned16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h new file mode 100644 index 000000000..d832de5fe --- /dev/null +++ b/volk/include/volk/volk_16sc_magnitude_16s_a16.h @@ -0,0 +1,190 @@ +#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H +#define INCLUDED_volk_16sc_magnitude_16s_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h deleted file mode 100644 index 41e8751d6..000000000 --- a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,190 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H -#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h deleted file mode 100644 index c2605d551..000000000 --- a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H -#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..53e4253c4 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..7320db368 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h new file mode 100644 index 000000000..649b5cc96 --- /dev/null +++ b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_a16.h b/volk/include/volk/volk_16u_byteswap_a16.h new file mode 100644 index 000000000..c8128dbab --- /dev/null +++ b/volk/include/volk/volk_16u_byteswap_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_16u_byteswap_a16_H +#define INCLUDED_volk_16u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_sse2(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + uint16_t* inputPtr = intsToSwap; + __m128i input, left, right, output; + + const unsigned int eighthPoints = num_points / 8; + for(;number < eighthPoints; number++){ + // Load the 16t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the two shifts + left = _mm_slli_epi16(input, 8); + right = _mm_srli_epi16(input, 8); + // Or the left and right halves together + output = _mm_or_si128(left, right); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 8; + } + + + // Byteswap any remaining points: + number = eighthPoints*8; + for(; number < num_points; number++){ + uint16_t outputVal = *inputPtr; + outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_16u_byteswap_a16_generic(uint16_t* intsToSwap, unsigned int num_points){ + unsigned int point; + uint16_t* inputPtr = intsToSwap; + for(point = 0; point < num_points; point++){ + uint16_t output = *inputPtr; + output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Byteswaps (in-place) an aligned vector of int16_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +extern void volk_16u_byteswap_a16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); +static inline void volk_16u_byteswap_a16_orc(uint16_t* intsToSwap, unsigned int num_points){ + volk_16u_byteswap_a16_orc_impl(intsToSwap, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h deleted file mode 100644 index 9d19d1a45..000000000 --- a/volk/include/volk/volk_16u_byteswap_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - uint16_t* inputPtr = intsToSwap; - __m128i input, left, right, output; - - const unsigned int eighthPoints = num_points / 8; - for(;number < eighthPoints; number++){ - // Load the 16t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the two shifts - left = _mm_slli_epi16(input, 8); - right = _mm_srli_epi16(input, 8); - // Or the left and right halves together - output = _mm_or_si128(left, right); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 8; - } - - - // Byteswap any remaining points: - number = eighthPoints*8; - for(; number < num_points; number++){ - uint16_t outputVal = *inputPtr; - outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){ - unsigned int point; - uint16_t* inputPtr = intsToSwap; - for(point = 0; point < num_points; point++){ - uint16_t output = *inputPtr; - output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Byteswaps (in-place) an aligned vector of int16_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -extern void volk_16u_byteswap_aligned16_orc_impl(uint16_t* intsToSwap, unsigned int num_points); -static inline void volk_16u_byteswap_aligned16_orc(uint16_t* intsToSwap, unsigned int num_points){ - volk_16u_byteswap_aligned16_orc_impl(intsToSwap, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h new file mode 100644 index 000000000..a0f97f94e --- /dev/null +++ b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h new file mode 100644 index 000000000..ba38c310f --- /dev/null +++ b/volk/include/volk/volk_32f_32f_add_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H +#define INCLUDED_volk_32f_32f_add_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h new file mode 100644 index 000000000..a0995e631 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_divide_32f_a16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H +#define INCLUDED_volk_32f_32f_divide_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h new file mode 100644 index 000000000..63f5221d3 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h new file mode 100644 index 000000000..b5fa7d7a4 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H +#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/ diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h new file mode 100644 index 000000000..34ea93349 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H +#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h new file mode 100644 index 000000000..8ca7a5ba8 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_max_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H +#define INCLUDED_volk_32f_32f_max_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h new file mode 100644 index 000000000..dd05988be --- /dev/null +++ b/volk/include/volk/volk_32f_32f_min_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H +#define INCLUDED_volk_32f_32f_min_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h new file mode 100644 index 000000000..2d004db10 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H +#define INCLUDED_volk_32f_32f_multiply_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h new file mode 100644 index 000000000..207382a19 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H +#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h new file mode 100644 index 000000000..9fea6aa27 --- /dev/null +++ b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H +#define INCLUDED_volk_32f_32f_subtract_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h deleted file mode 100644 index 7e395cf50..000000000 --- a/volk/include/volk/volk_32f_accumulator_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H -#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float tempBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_accumulator_s32f_a16.h b/volk/include/volk/volk_32f_accumulator_s32f_a16.h new file mode 100644 index 000000000..4a3588e6d --- /dev/null +++ b/volk/include/volk/volk_32f_accumulator_s32f_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_32f_accumulator_s32f_a16_H +#define INCLUDED_volk_32f_accumulator_s32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float tempBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_32f_accumulator_s32f_a16_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_accumulator_s32f_a16_H */ diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h deleted file mode 100644 index e7d8de265..000000000 --- a/volk/include/volk/volk_32f_add_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H -#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_add_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_add_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_add_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h new file mode 100644 index 000000000..fce77cd04 --- /dev/null +++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_a16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H +#define INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */ diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h deleted file mode 100644 index ff917525f..000000000 --- a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h +++ /dev/null @@ -1,167 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H -#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h deleted file mode 100644 index 7fbabd9c3..000000000 --- a/volk/include/volk/volk_32f_convert_16s_aligned16.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h deleted file mode 100644 index d2bbdf13a..000000000 --- a/volk/include/volk/volk_32f_convert_16s_unaligned16.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h deleted file mode 100644 index 011ef5d0e..000000000 --- a/volk/include/volk/volk_32f_convert_32s_aligned16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h deleted file mode 100644 index a6df826c7..000000000 --- a/volk/include/volk/volk_32f_convert_32s_unaligned16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_a16.h b/volk/include/volk/volk_32f_convert_64f_a16.h new file mode 100644 index 000000000..c303dc118 --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_a16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_a16_H +#define INCLUDED_volk_32f_convert_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_a16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_store_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_a16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h deleted file mode 100644 index 91a855813..000000000 --- a/volk/include/volk/volk_32f_convert_64f_aligned16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_store_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h new file mode 100644 index 000000000..c8de768dc --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_ua16.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_ua16_H +#define INCLUDED_volk_32f_convert_64f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h deleted file mode 100644 index 698e0d446..000000000 --- a/volk/include/volk/volk_32f_convert_64f_unaligned16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h deleted file mode 100644 index b9487b622..000000000 --- a/volk/include/volk/volk_32f_convert_8s_aligned16.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h deleted file mode 100644 index e986dbc87..000000000 --- a/volk/include/volk/volk_32f_convert_8s_unaligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H -#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h deleted file mode 100644 index c595b5e92..000000000 --- a/volk/include/volk/volk_32f_divide_aligned16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H -#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_divide_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_divide_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_divide_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h deleted file mode 100644 index 3aee1136a..000000000 --- a/volk/include/volk/volk_32f_dot_prod_aligned16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_dot_prod_aligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h deleted file mode 100644 index bce6aa15f..000000000 --- a/volk/include/volk/volk_32f_dot_prod_unaligned16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H -#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h deleted file mode 100644 index c82239d74..000000000 --- a/volk/include/volk/volk_32f_fm_detect_aligned16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H -#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_noints The number of real values in the input vector. -*/ -static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 1; - unsigned int j = 0; - // num_points-1 keeps Fedora 7's gcc from crashing... - // num_points won't work. :( - const unsigned int quarterPoints = (num_points-1) / 4; - - float* outPtr = outputVector; - const float* inPtr = inputVector; - __m128 upperBound = _mm_set_ps1(bound); - __m128 lowerBound = _mm_set_ps1(-bound); - __m128 next3old1; - __m128 next4; - __m128 boundAdjust; - __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. - __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. - // Do the first 4 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - for (; number < quarterPoints; number++) { - // Load data - next3old1 = _mm_loadu_ps((float*) (inPtr-1)); - next4 = _mm_load_ps(inPtr); - inPtr += 4; - // Subtract and store: - next3old1 = _mm_sub_ps(next4, next3old1); - // Bound: - boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); - boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); - next4 = _mm_cmplt_ps(next3old1, lowerBound); - next4 = _mm_and_ps(next4, negBoundAdjust); - boundAdjust = _mm_or_ps(next4, boundAdjust); - // Make sure we're in the bounding interval: - next3old1 = _mm_add_ps(next3old1, boundAdjust); - _mm_store_ps(outPtr,next3old1); // Store the results back into the output - outPtr += 4; - } - - for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) - \param bound The interval that the input phase data is in, which is used to modulo the differentiation - \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. - \param num_points The number of real values in the input vector. -*/ -static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ - if (num_points < 1) { - return; - } - unsigned int number = 0; - float* outPtr = outputVector; - const float* inPtr = inputVector; - - // Do the first 1 by hand since we're going in from the saveValue: - *outPtr = *inPtr - *saveValue; - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - - for (number = 1; number < num_points; number++) { - *outPtr = *(inPtr) - *(inPtr-1); - if (*outPtr > bound) *outPtr -= 2*bound; - if (*outPtr < -bound) *outPtr += 2*bound; - inPtr++; - outPtr++; - } - - *saveValue = inputVector[num_points-1]; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_index_max_16u_a16.h b/volk/include/volk/volk_32f_index_max_16u_a16.h new file mode 100644 index 000000000..d070e17d5 --- /dev/null +++ b/volk/include/volk/volk_32f_index_max_16u_a16.h @@ -0,0 +1,148 @@ +#ifndef INCLUDED_volk_32f_index_max_16u_a16_H +#define INCLUDED_volk_32f_index_max_16u_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +static inline void volk_32f_index_max_16u_a16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#if LV_HAVE_SSE +#include + +static inline void volk_32f_index_max_16u_a16_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + float maxValuesBuffer[4] __attribute__((aligned(16))); + float maxIndexesBuffer[4] __attribute__((aligned(16))); + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_GENERIC +static inline void volk_32f_index_max_16u_a16_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h deleted file mode 100644 index 26322bfa2..000000000 --- a/volk/include/volk/volk_32f_index_max_aligned16.h +++ /dev/null @@ -1,148 +0,0 @@ -#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#if LV_HAVE_SSE -#include - -static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - float maxValuesBuffer[4] __attribute__((aligned(16))); - float maxIndexesBuffer[4] __attribute__((aligned(16))); - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_GENERIC -static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - float max = src0[0]; - unsigned int index = 0; - - int i = 1; - - for(; i < num_points; ++i) { - - if(src0[i] > max){ - index = i; - max = src0[i]; - } - - } - target[0] = index; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h deleted file mode 100644 index 476946b88..000000000 --- a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H -#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h deleted file mode 100644 index 859c6a0ef..000000000 --- a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H -#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h deleted file mode 100644 index d4e30fba8..000000000 --- a/volk/include/volk/volk_32f_max_aligned16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_max_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_max_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_max_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h deleted file mode 100644 index 55daafb6a..000000000 --- a/volk/include/volk/volk_32f_min_aligned16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H -#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_min_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_min_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_min_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h deleted file mode 100644 index 87ae7bcf8..000000000 --- a/volk/include/volk/volk_32f_multiply_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_multiply_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_multiply_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h deleted file mode 100644 index 323d0513c..000000000 --- a/volk/include/volk/volk_32f_normalize_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H -#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) - \param vecBuffer The buffer of values to be vectorized - \param num_points The number of values in vecBuffer - \param scalar The scale value to be applied to each buffer value -*/ -static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - - const float invScalar = 1.0 / scalar; - __m128 vecScalar = _mm_set_ps1(invScalar); - - __m128 input1; - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - - input1 = _mm_load_ps(inputPtr); - - input1 = _mm_mul_ps(input1, vecScalar); - - _mm_store_ps(inputPtr, input1); - - inputPtr += 4; - } - - number = quarterPoints*4; - for(; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* inputPtr = vecBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *inputPtr *= invScalar; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Normalizes the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be normalizeed - \param bVector One of the vectors to be normalizeed - \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector -*/ -extern void volk_32f_normalize_aligned16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); -static inline void volk_32f_normalize_aligned16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ - float invscalar = 1.0 / scalar; - volk_32f_normalize_aligned16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h deleted file mode 100644 index 2ecd8eecb..000000000 --- a/volk/include/volk/volk_32f_power_aligned16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H -#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_blendv_ps(aVal, negatedValues, signMask); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - __m128 zeroValue = _mm_setzero_ps(); - __m128 signMask; - __m128 negatedValues; - __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); - __m128 onesMask = _mm_set_ps1(1); - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - signMask = _mm_cmplt_ps(aVal, zeroValue); - negatedValues = _mm_sub_ps(zeroValue, aVal); - aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); - - // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after - cVal = powf4(aVal, vPower); // Takes each input value to the specified power - - cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - for(;number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = powf((*aPtr++), power); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h new file mode 100644 index 000000000..ff4d5b19c --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_32f_fm_detect_32f_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H +#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_noints The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 1; + unsigned int j = 0; + // num_points-1 keeps Fedora 7's gcc from crashing... + // num_points won't work. :( + const unsigned int quarterPoints = (num_points-1) / 4; + + float* outPtr = outputVector; + const float* inPtr = inputVector; + __m128 upperBound = _mm_set_ps1(bound); + __m128 lowerBound = _mm_set_ps1(-bound); + __m128 next3old1; + __m128 next4; + __m128 boundAdjust; + __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above. + __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below. + // Do the first 4 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + for (; number < quarterPoints; number++) { + // Load data + next3old1 = _mm_loadu_ps((float*) (inPtr-1)); + next4 = _mm_load_ps(inPtr); + inPtr += 4; + // Subtract and store: + next3old1 = _mm_sub_ps(next4, next3old1); + // Bound: + boundAdjust = _mm_cmpgt_ps(next3old1, upperBound); + boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust); + next4 = _mm_cmplt_ps(next3old1, lowerBound); + next4 = _mm_and_ps(next4, negBoundAdjust); + boundAdjust = _mm_or_ps(next4, boundAdjust); + // Make sure we're in the bounding interval: + next3old1 = _mm_add_ps(next3old1, boundAdjust); + _mm_store_ps(outPtr,next3old1); // Store the results back into the output + outPtr += 4; + } + + for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] ) + \param bound The interval that the input phase data is in, which is used to modulo the differentiation + \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample. + \param num_points The number of real values in the input vector. +*/ +static inline void volk_32f_s32f_32f_fm_detect_32f_a16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){ + if (num_points < 1) { + return; + } + unsigned int number = 0; + float* outPtr = outputVector; + const float* inPtr = inputVector; + + // Do the first 1 by hand since we're going in from the saveValue: + *outPtr = *inPtr - *saveValue; + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + + for (number = 1; number < num_points; number++) { + *outPtr = *(inPtr) - *(inPtr-1); + if (*outPtr > bound) *outPtr -= 2*bound; + if (*outPtr < -bound) *outPtr += 2*bound; + inPtr++; + outPtr++; + } + + *saveValue = inputVector[num_points-1]; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h new file mode 100644 index 000000000..cf51cf9c5 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H +#define INCLUDED_volk_32f_s32f_convert_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h new file mode 100644 index 000000000..53d159f82 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h new file mode 100644 index 000000000..0be649418 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H +#define INCLUDED_volk_32f_s32f_convert_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h new file mode 100644 index 000000000..efb2c3a20 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h new file mode 100644 index 000000000..69ccec5c6 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H +#define INCLUDED_volk_32f_s32f_convert_8s_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h new file mode 100644 index 000000000..af1652b19 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H +#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_normalize_a16.h b/volk/include/volk/volk_32f_s32f_normalize_a16.h new file mode 100644 index 000000000..0850cddf7 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_normalize_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_s32f_normalize_a16_H +#define INCLUDED_volk_32f_s32f_normalize_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value ) + \param vecBuffer The buffer of values to be vectorized + \param num_points The number of values in vecBuffer + \param scalar The scale value to be applied to each buffer value +*/ +static inline void volk_32f_s32f_normalize_a16_sse(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + + const float invScalar = 1.0 / scalar; + __m128 vecScalar = _mm_set_ps1(invScalar); + + __m128 input1; + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + + input1 = _mm_load_ps(inputPtr); + + input1 = _mm_mul_ps(input1, vecScalar); + + _mm_store_ps(inputPtr, input1); + + inputPtr += 4; + } + + number = quarterPoints*4; + for(; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +static inline void volk_32f_s32f_normalize_a16_generic(float* vecBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* inputPtr = vecBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *inputPtr *= invScalar; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Normalizes the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be normalizeed + \param bVector One of the vectors to be normalizeed + \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector +*/ +extern void volk_32f_s32f_normalize_a16_orc_impl(float* dst, float* src, const float scalar, unsigned int num_points); +static inline void volk_32f_s32f_normalize_a16_orc(float* vecBuffer, const float scalar, unsigned int num_points){ + float invscalar = 1.0 / scalar; + volk_32f_s32f_normalize_a16_orc_impl(vecBuffer, vecBuffer, invscalar, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_normalize_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_power_32f_a16.h b/volk/include/volk/volk_32f_s32f_power_32f_a16.h new file mode 100644 index 000000000..3ed594d9a --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_power_32f_a16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_power_32f_a16_H +#define INCLUDED_volk_32f_s32f_power_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_blendv_ps(aVal, negatedValues, signMask); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32f_s32f_power_32f_a16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + __m128 zeroValue = _mm_setzero_ps(); + __m128 signMask; + __m128 negatedValues; + __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power)); + __m128 onesMask = _mm_set_ps1(1); + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + signMask = _mm_cmplt_ps(aVal, zeroValue); + negatedValues = _mm_sub_ps(zeroValue, aVal); + aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) ); + + // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after + cVal = powf4(aVal, vPower); // Takes each input value to the specified power + + cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + for(;number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32f_s32f_power_32f_a16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = powf((*aPtr++), power); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_power_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h new file mode 100644 index 000000000..32f4fa067 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_stddev_32f_a16.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a16_H +#define INCLUDED_volk_32f_s32f_stddev_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation of the input buffer using the supplied mean + \param stddev The calculated standard deviation + \param inputBuffer The buffer of points to calculate the std deviation for + \param mean The mean of the input buffer + \param num_points The number of values in input buffer to used in the stddev calculation +*/ +static inline void volk_32f_s32f_stddev_32f_a16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ + float returnValue = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + aPtr++; + } + + returnValue /= num_points; + returnValue -= (mean * mean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_32f_a16.h b/volk/include/volk/volk_32f_sqrt_32f_a16.h new file mode 100644 index 000000000..513c2cffe --- /dev/null +++ b/volk/include/volk/volk_32f_sqrt_32f_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32f_sqrt_32f_a16_H +#define INCLUDED_volk_32f_sqrt_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_sse(float* cVector, const float* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + + __m128 aVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + + cVal = _mm_sqrt_ps(aVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_generic(float* cVector, const float* aVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = sqrtf(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +extern void volk_32f_sqrt_32f_a16_orc_impl(float *, const float*, unsigned int); +/*! + \brief Sqrts the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be sqrted + \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector +*/ +static inline void volk_32f_sqrt_32f_a16_orc(float* cVector, const float* aVector, unsigned int num_points){ + volk_32f_sqrt_32f_a16_orc_impl(cVector, aVector, num_points); +} + +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_sqrt_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h deleted file mode 100644 index f6996ad5f..000000000 --- a/volk/include/volk/volk_32f_sqrt_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H -#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - - __m128 aVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - - cVal = _mm_sqrt_ps(aVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = sqrtf(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -extern void volk_32f_sqrt_aligned16_orc_impl(float *, const float*, unsigned int); -/*! - \brief Sqrts the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be sqrted - \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector -*/ -static inline void volk_32f_sqrt_aligned16_orc(float* cVector, const float* aVector, unsigned int num_points){ - volk_32f_sqrt_aligned16_orc_impl(cVector, aVector, num_points); -} - -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h deleted file mode 100644 index 1c6a08437..000000000 --- a/volk/include/volk/volk_32f_stddev_aligned16.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H -#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation of the input buffer using the supplied mean - \param stddev The calculated standard deviation - \param inputBuffer The buffer of points to calculate the std deviation for - \param mean The mean of the input buffer - \param num_points The number of values in input buffer to used in the stddev calculation -*/ -static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){ - float returnValue = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - aPtr++; - } - - returnValue /= num_points; - returnValue -= (mean * mean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h new file mode 100644 index 000000000..2ba809845 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h deleted file mode 100644 index 1cd502257..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H -#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h deleted file mode 100644 index e15242901..000000000 --- a/volk/include/volk/volk_32f_subtract_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H -#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_subtract_aligned16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_subtract_aligned16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_subtract_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h deleted file mode 100644 index a326e62b1..000000000 --- a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H -#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - //xmm11 = _mm_load1_ps(¢er_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h new file mode 100644 index 000000000..514998800 --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32f_multiply_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies the input complex vector with the input float vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the float values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; + for(;number < quarterPoints; number++){ + + aVal1 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + aVal2 = _mm_load_ps((const float*)aPtr); + aPtr += 2; + + bVal = _mm_load_ps(bPtr); + bPtr += 4; + + bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); + bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); + + cVal = _mm_mul_ps(aVal1, bVal1); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + cPtr += 2; + + cVal = _mm_mul_ps(aVal2, bVal2); + + _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr); + bPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32f_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The complex vector to be multiplied + \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32f_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32fc_32f_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ + volk_32fc_32f_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_GENERIC */ + + + +#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h deleted file mode 100644 index 304ed8e2d..000000000 --- a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies the input complex vector with the input float vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the float values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal; - for(;number < quarterPoints; number++){ - - aVal1 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - aVal2 = _mm_load_ps((const float*)aPtr); - aPtr += 2; - - bVal = _mm_load_ps(bPtr); - bPtr += 4; - - bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0)); - bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2)); - - cVal = _mm_mul_ps(aVal1, bVal1); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - cPtr += 2; - - cVal = _mm_mul_ps(aVal2, bVal2); - - _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr); - bPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The complex vector to be multiplied - \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32f_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32fc_32f_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){ - volk_32fc_32f_multiply_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_GENERIC */ - - - -#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h new file mode 100644 index 000000000..6f9e9e3ee --- /dev/null +++ b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_32f_power_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h deleted file mode 100644 index 2d71ee4f8..000000000 --- a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H -#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; - for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h new file mode 100644 index 000000000..cd9cc8160 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h new file mode 100644 index 000000000..2ccfcf2f2 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h new file mode 100644 index 000000000..59259882c --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h new file mode 100644 index 000000000..14f511697 --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h new file mode 100644 index 000000000..b6c72adbf --- /dev/null +++ b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h deleted file mode 100644 index df0ebb987..000000000 --- a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H -#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_blendv_ps(correctVector, phase, keepMask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The byte-aligned vector where the results will be stored. - \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* outPtr = outputVector; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - const float invNormalizeFactor = 1.0 / normalizeFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 testVector = _mm_set_ps1(2*M_PI); - __m128 correctVector = _mm_set_ps1(M_PI); - __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); - __m128 phase; - __m128 complex1, complex2, iValue, qValue; - __m128 mask; - __m128 keepMask; - - for (; number < quarterPoints; number++) { - // Load IQ data: - complex1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - complex2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - // Deinterleave IQ data: - iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); - // Arctan to get phase: - phase = atan2f4(qValue, iValue); - // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. - // Compare to 2pi: - keepMask = _mm_cmpneq_ps(phase,testVector); - phase = _mm_and_ps(phase, keepMask); - mask = _mm_andnot_ps(keepMask, correctVector); - phase = _mm_or_ps(phase, mask); - // done with above correction. - phase = _mm_mul_ps(phase, vNormalizeFactor); - _mm_store_ps((float*)outPtr, phase); - outPtr += 4; - } - number = quarterPoints * 4; -#endif /* LV_HAVE_SIMDMATH_H */ - - for (; number < num_points; number++) { - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief performs the atan2 on the input vector and stores the results in the output vector. - \param outputVector The vector where the results will be stored. - \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). - \param normalizeFactor The atan2 results will be divided by this normalization factor. - \param num_points The number of complex values in the input vector. -*/ -static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ - float* outPtr = outputVector; - const float* inPtr = (float*)inputVector; - const float invNormalizeFactor = 1.0 / normalizeFactor; - unsigned int number; - for ( number = 0; number < num_points; number++) { - const float real = *inPtr++; - const float imag = *inPtr++; - *outPtr++ = atan2f(imag, real) * invNormalizeFactor; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h deleted file mode 100644 index 60103c1b5..000000000 --- a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..3ee579c2e --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h deleted file mode 100644 index 02085cd1e..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h new file mode 100644 index 000000000..404defc36 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h deleted file mode 100644 index 3d9ebccdd..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..53235e5f7 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index 3026b2422..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..9838ec88b --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_a16.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + float* iBufferPtr = iBuffer; + + __m128 cplxValue1, cplxValue2, iValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 2af973bcc..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - float* iBufferPtr = iBuffer; - - __m128 cplxValue1, cplxValue2, iValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h new file mode 100644 index 000000000..af392d074 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + iBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_64f_a16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h deleted file mode 100644 index f408589c4..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - iBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h deleted file mode 100644 index 1a834dc25..000000000 --- a/volk/include/volk/volk_32fc_dot_prod_aligned16.h +++ /dev/null @@ -1,468 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H -#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H - -#include -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_16u_a16.h b/volk/include/volk/volk_32fc_index_max_16u_a16.h new file mode 100644 index 000000000..532ae4e7c --- /dev/null +++ b/volk/include/volk/volk_32fc_index_max_16u_a16.h @@ -0,0 +1,215 @@ +#ifndef INCLUDED_volk_32fc_index_max_16u_a16_H +#define INCLUDED_volk_32fc_index_max_16u_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + + +static inline void volk_32fc_index_max_16u_a16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + + + + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; + + + + + union bit128 xmm5, xmm4; + __m128 xmm1, xmm2, xmm3; + __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; + + xmm5.int_vec = xmmfive = _mm_setzero_si128(); + xmm4.int_vec = xmmfour = _mm_setzero_si128(); + holderf.int_vec = holder0 = _mm_setzero_si128(); + holderi.int_vec = holder1 = _mm_setzero_si128(); + + + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! + xmm9 = xmm8 = _mm_setzero_si128(); + xmm10 = _mm_set_epi32(4, 4, 4, 4); + xmm3 = _mm_setzero_ps(); +; + + //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); + + for(; i < bound; ++i) { + + xmm1 = _mm_load_ps((float*)src0); + xmm2 = _mm_load_ps((float*)&src0[2]); + + + src0 += 4; + + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); + + + xmm1 = _mm_hadd_ps(xmm1, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); + + } + + + for(i = 0; i < leftovers0; ++i) { + + + xmm2 = _mm_load_ps((float*)src0); + + xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); + xmm8 = (__m128i)xmm1; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + + src0 += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + + xmm3 = _mm_max_ps(xmm1, xmm3); + + xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + + xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + xmm8 = _mm_add_epi32(xmm8, xmm10); + //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + } + + + + + for(i = 0; i < leftovers1; ++i) { + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + + sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + + xmm1 = xmm3; + + xmm3 = _mm_max_ss(xmm3, xmm2); + + + + xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); + xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); + + + xmm8 = _mm_shuffle_epi32(xmm8, 0x00); + + xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); + xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); + + + xmm9 = _mm_add_epi32(xmm11, xmm12); + + } + + //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); + + //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); + + _mm_store_ps((float*)&(holderf.f), xmm3); + _mm_store_si128(&(holderi.int_vec), xmm9); + + target[0] = holderi.i[0]; + sq_dist = holderf.f[0]; + target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; + sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; + target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; + sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; + target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; + sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; + + + + /* + float placeholder = 0.0; + uint32_t temp0, temp1; + unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); + unsigned int l0 = g0 ^ 1; + + unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); + unsigned int l1 = g1 ^ 1; + + temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; + temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; + sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; + placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; + + g0 = (sq_dist > placeholder); + l0 = g0 ^ 1; + target[0] = g0 * temp0 + l0 * temp1; + */ + +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_index_max_16u_a16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { + float sq_dist = 0.0; + float max = 0.0; + unsigned int index = 0; + + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + + sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); + + index = sq_dist > max ? i : index; + max = sq_dist > max ? sq_dist : max; + + + } + target[0] = index; + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_index_max_16u_a16_H*/ diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h deleted file mode 100644 index d77a95f90..000000000 --- a/volk/include/volk/volk_32fc_index_max_aligned16.h +++ /dev/null @@ -1,215 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H -#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - - -static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - - - - union bit128 holderf; - union bit128 holderi; - float sq_dist = 0.0; - - - - - union bit128 xmm5, xmm4; - __m128 xmm1, xmm2, xmm3; - __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10; - - xmm5.int_vec = xmmfive = _mm_setzero_si128(); - xmm4.int_vec = xmmfour = _mm_setzero_si128(); - holderf.int_vec = holder0 = _mm_setzero_si128(); - holderi.int_vec = holder1 = _mm_setzero_si128(); - - - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order! - xmm9 = xmm8 = _mm_setzero_si128(); - xmm10 = _mm_set_epi32(4, 4, 4, 4); - xmm3 = _mm_setzero_ps(); -; - - //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]); - - for(; i < bound; ++i) { - - xmm1 = _mm_load_ps((float*)src0); - xmm2 = _mm_load_ps((float*)&src0[2]); - - - src0 += 4; - - - xmm1 = _mm_mul_ps(xmm1, xmm1); - xmm2 = _mm_mul_ps(xmm2, xmm2); - - - xmm1 = _mm_hadd_ps(xmm1, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]); - - } - - - for(i = 0; i < leftovers0; ++i) { - - - xmm2 = _mm_load_ps((float*)src0); - - xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8); - xmm8 = (__m128i)xmm1; - - xmm2 = _mm_mul_ps(xmm2, xmm2); - - src0 += 2; - - xmm1 = _mm_hadd_ps(xmm2, xmm2); - - xmm3 = _mm_max_ps(xmm1, xmm3); - - xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]); - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - - xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - xmm8 = _mm_add_epi32(xmm8, xmm10); - //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - } - - - - - for(i = 0; i < leftovers1; ++i) { - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - - sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); - - xmm2 = _mm_load1_ps(&sq_dist); - - xmm1 = xmm3; - - xmm3 = _mm_max_ss(xmm3, xmm2); - - - - xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); - xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); - - - xmm8 = _mm_shuffle_epi32(xmm8, 0x00); - - xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); - xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); - - - xmm9 = _mm_add_epi32(xmm11, xmm12); - - } - - //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]); - - //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]); - - _mm_store_ps((float*)&(holderf.f), xmm3); - _mm_store_si128(&(holderi.int_vec), xmm9); - - target[0] = holderi.i[0]; - sq_dist = holderf.f[0]; - target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0]; - sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist; - target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0]; - sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist; - target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0]; - sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist; - - - - /* - float placeholder = 0.0; - uint32_t temp0, temp1; - unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]); - unsigned int l0 = g0 ^ 1; - - unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]); - unsigned int l1 = g1 ^ 1; - - temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1]; - temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3]; - sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; - placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3]; - - g0 = (sq_dist > placeholder); - l0 = g0 ^ 1; - target[0] = g0 * temp0 + l0 * temp1; - */ - -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) { - float sq_dist = 0.0; - float max = 0.0; - unsigned int index = 0; - - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - - sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); - - index = sq_dist > max ? i : index; - max = sq_dist > max ? sq_dist : max; - - - } - target[0] = index; - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h deleted file mode 100644 index 4e590e120..000000000 --- a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_magnitude_16s_aligned16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_magnitude_16s_aligned16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_magnitude_16s_aligned16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_a16.h b/volk/include/volk/volk_32fc_magnitude_32f_a16.h new file mode 100644 index 000000000..be7216dce --- /dev/null +++ b/volk/include/volk/volk_32fc_magnitude_32f_a16.h @@ -0,0 +1,132 @@ +#ifndef INCLUDED_volk_32fc_magnitude_32f_a16_H +#define INCLUDED_volk_32fc_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_32fc_magnitude_32f_a16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_32fc_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); +static inline void volk_32fc_magnitude_32f_a16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + volk_32fc_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h deleted file mode 100644 index 3ea62da6a..000000000 --- a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -extern void volk_32fc_magnitude_32f_aligned16_orc_impl(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points); -static inline void volk_32fc_magnitude_32f_aligned16_orc(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - volk_32fc_magnitude_32f_aligned16_orc_impl(magnitudeVector, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h deleted file mode 100644 index c8f2418c3..000000000 --- a/volk/include/volk/volk_32fc_multiply_aligned16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H -#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_multiply_aligned16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); -static inline void volk_32fc_multiply_aligned16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_multiply_aligned16_orc_impl(cVector, aVector, bVector, mask, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h deleted file mode 100644 index 52ec0f95b..000000000 --- a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h deleted file mode 100644 index 645629b9d..000000000 --- a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H -#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iNormalizationFactor = 1.0 / normalizationFactor; -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value for each input point - \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided agains all the input values before the power is calculated - \param num_points The number of fft data points -*/ -static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - const float iNormalizationFactor = 1.0 / normalizationFactor; - unsigned int point; - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); - - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h new file mode 100644 index 000000000..e9f74438d --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_atan2_32f_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a16_H +#define INCLUDED_volk_32fc_s32f_atan2_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse4_1(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_blendv_ps(correctVector, phase, keepMask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The byte-aligned vector where the results will be stored. + \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_sse(float* outputVector, const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* outPtr = outputVector; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + const float invNormalizeFactor = 1.0 / normalizeFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 testVector = _mm_set_ps1(2*M_PI); + __m128 correctVector = _mm_set_ps1(M_PI); + __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor); + __m128 phase; + __m128 complex1, complex2, iValue, qValue; + __m128 mask; + __m128 keepMask; + + for (; number < quarterPoints; number++) { + // Load IQ data: + complex1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + complex2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + // Deinterleave IQ data: + iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); + // Arctan to get phase: + phase = atan2f4(qValue, iValue); + // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi. + // Compare to 2pi: + keepMask = _mm_cmpneq_ps(phase,testVector); + phase = _mm_and_ps(phase, keepMask); + mask = _mm_andnot_ps(keepMask, correctVector); + phase = _mm_or_ps(phase, mask); + // done with above correction. + phase = _mm_mul_ps(phase, vNormalizeFactor); + _mm_store_ps((float*)outPtr, phase); + outPtr += 4; + } + number = quarterPoints * 4; +#endif /* LV_HAVE_SIMDMATH_H */ + + for (; number < num_points; number++) { + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief performs the atan2 on the input vector and stores the results in the output vector. + \param outputVector The vector where the results will be stored. + \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin). + \param normalizeFactor The atan2 results will be divided by this normalization factor. + \param num_points The number of complex values in the input vector. +*/ +static inline void volk_32fc_s32f_atan2_32f_a16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){ + float* outPtr = outputVector; + const float* inPtr = (float*)inputVector; + const float invNormalizeFactor = 1.0 / normalizeFactor; + unsigned int number; + for ( number = 0; number < num_points; number++) { + const float real = *inPtr++; + const float imag = *inPtr++; + *outPtr++ = atan2f(imag, real) * invNormalizeFactor; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h new file mode 100644 index 000000000..dc3c6741a --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h new file mode 100644 index 000000000..39d8f7aa2 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_spectrum_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H +#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iNormalizationFactor = 1.0 / normalizationFactor; +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value for each input point + \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided agains all the input values before the power is calculated + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_power_spectrum_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + const float iNormalizationFactor = 1.0 / normalizationFactor; + unsigned int point; + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20); + + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h new file mode 100644 index 000000000..29ccdaef7 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h deleted file mode 100644 index 6458ea4dd..000000000 --- a/volk/include/volk/volk_32fc_square_dist_aligned16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H -#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h deleted file mode 100644 index 0fcc86f1e..000000000 --- a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H -#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h new file mode 100644 index 000000000..0e8380757 --- /dev/null +++ b/volk/include/volk/volk_32s_32s_and_32s_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H +#define INCLUDED_volk_32s_32s_and_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h new file mode 100644 index 000000000..2dcf2e551 --- /dev/null +++ b/volk/include/volk/volk_32s_32s_or_32s_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H +#define INCLUDED_volk_32s_32s_or_32s_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h deleted file mode 100644 index 16c63fd48..000000000 --- a/volk/include/volk/volk_32s_and_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H -#define INCLUDED_VOLK_32s_AND_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32s_and_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_and_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_and_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h deleted file mode 100644 index a407e68bd..000000000 --- a/volk/include/volk/volk_32s_convert_32f_aligned16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h deleted file mode 100644 index ad7d4eb17..000000000 --- a/volk/include/volk/volk_32s_convert_32f_unaligned16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h deleted file mode 100644 index 64748d535..000000000 --- a/volk/include/volk/volk_32s_or_aligned16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H -#define INCLUDED_VOLK_32s_OR_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32s_or_aligned16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_or_aligned16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_or_aligned16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..c16ecc9dd --- /dev/null +++ b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H +#define INCLUDED_volk_32s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..4eb5a5b85 --- /dev/null +++ b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_a16.h b/volk/include/volk/volk_32u_byteswap_a16.h new file mode 100644 index 000000000..7556ec7b1 --- /dev/null +++ b/volk/include/volk/volk_32u_byteswap_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_32u_byteswap_a16_H +#define INCLUDED_volk_32u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_sse2(uint32_t* intsToSwap, unsigned int num_points){ + unsigned int number = 0; + + uint32_t* inputPtr = intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = quarterPoints*4; + for(; number < num_points; number++){ + uint32_t outputVal = *inputPtr; + outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); + *inputPtr = outputVal; + inputPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int32_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_32u_byteswap_a16_generic(uint32_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = intsToSwap; + + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output = *inputPtr; + output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); + + *inputPtr = output; + inputPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h deleted file mode 100644 index 09173a9d5..000000000 --- a/volk/include/volk/volk_32u_byteswap_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){ - unsigned int number = 0; - - uint32_t* inputPtr = intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = quarterPoints*4; - for(; number < num_points; number++){ - uint32_t outputVal = *inputPtr; - outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000)); - *inputPtr = outputVal; - inputPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int32_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = intsToSwap; - - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output = *inputPtr; - output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000)); - - *inputPtr = output; - inputPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_32u_popcnt_a16.h b/volk/include/volk/volk_32u_popcnt_a16.h new file mode 100644 index 000000000..f6e25e4e8 --- /dev/null +++ b/volk/include/volk/volk_32u_popcnt_a16.h @@ -0,0 +1,36 @@ +#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H +#define INCLUDED_VOLK_32u_POPCNT_A16_H + +#include +#include + + +#if LV_HAVE_GENERIC + +static inline void volk_32u_popcnt_a16_generic(uint32_t* ret, const uint32_t value) { + + // This is faster than a lookup table + uint32_t retVal = value; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + + *ret = retVal; +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 + +#include + +static inline void volk_32u_popcnt_a16_sse4_2(uint32_t* ret, const uint32_t value) { + *ret = _mm_popcnt_u32(value); +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h deleted file mode 100644 index 37cfd112c..000000000 --- a/volk/include/volk/volk_32u_popcnt_aligned16.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H -#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - -static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) { - - // This is faster than a lookup table - uint32_t retVal = value; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - - *ret = retVal; -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 - -#include - -static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) { - *ret = _mm_popcnt_u32(value); -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h new file mode 100644 index 000000000..7e091851f --- /dev/null +++ b/volk/include/volk/volk_64f_64f_max_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H +#define INCLUDED_volk_64f_64f_max_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h new file mode 100644 index 000000000..f2bcbe83b --- /dev/null +++ b/volk/include/volk/volk_64f_64f_min_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H +#define INCLUDED_volk_64f_64f_min_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_a16.h b/volk/include/volk/volk_64f_convert_32f_a16.h new file mode 100644 index 000000000..7dca065f0 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_a16_H +#define INCLUDED_volk_64f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_a16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_a16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h deleted file mode 100644 index 44df66104..000000000 --- a/volk/include/volk/volk_64f_convert_32f_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h new file mode 100644 index 000000000..7774db1b7 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_ua16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_ua16_H +#define INCLUDED_volk_64f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h deleted file mode 100644 index 08cfb6127..000000000 --- a/volk/include/volk/volk_64f_convert_32f_unaligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h deleted file mode 100644 index ce4907a8c..000000000 --- a/volk/include/volk/volk_64f_max_aligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H -#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h deleted file mode 100644 index acf4d6b2a..000000000 --- a/volk/include/volk/volk_64f_min_aligned16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H -#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_a16.h b/volk/include/volk/volk_64u_byteswap_a16.h new file mode 100644 index 000000000..0eefe0138 --- /dev/null +++ b/volk/include/volk/volk_64u_byteswap_a16.h @@ -0,0 +1,88 @@ +#ifndef INCLUDED_volk_64u_byteswap_a16_H +#define INCLUDED_volk_64u_byteswap_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_sse2(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + __m128i input, byte1, byte2, byte3, byte4, output; + __m128i byte2mask = _mm_set1_epi32(0x00FF0000); + __m128i byte3mask = _mm_set1_epi32(0x0000FF00); + uint64_t number = 0; + const unsigned int halfPoints = num_points / 2; + for(;number < halfPoints; number++){ + // Load the 32t values, increment inputPtr later since we're doing it in-place. + input = _mm_load_si128((__m128i*)inputPtr); + + // Do the four shifts + byte1 = _mm_slli_epi32(input, 24); + byte2 = _mm_slli_epi32(input, 8); + byte3 = _mm_srli_epi32(input, 8); + byte4 = _mm_srli_epi32(input, 24); + // Or bytes together + output = _mm_or_si128(byte1, byte4); + byte2 = _mm_and_si128(byte2, byte2mask); + output = _mm_or_si128(output, byte2); + byte3 = _mm_and_si128(byte3, byte3mask); + output = _mm_or_si128(output, byte3); + + // Reorder the two words + output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); + + // Store the results + _mm_store_si128((__m128i*)inputPtr, output); + inputPtr += 4; + } + + // Byteswap any remaining points: + number = halfPoints*2; + for(; number < num_points; number++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Byteswaps (in-place) an aligned vector of int64_t's. + \param intsToSwap The vector of data to byte swap + \param numDataPoints The number of data points +*/ +static inline void volk_64u_byteswap_a16_generic(uint64_t* intsToSwap, unsigned int num_points){ + uint32_t* inputPtr = (uint32_t*)intsToSwap; + unsigned int point; + for(point = 0; point < num_points; point++){ + uint32_t output1 = *inputPtr; + uint32_t output2 = inputPtr[1]; + + output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); + + output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); + + *inputPtr++ = output2; + *inputPtr++ = output1; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64u_byteswap_a16_H */ diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h deleted file mode 100644 index d5e1b6f30..000000000 --- a/volk/include/volk/volk_64u_byteswap_aligned16.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H -#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - __m128i input, byte1, byte2, byte3, byte4, output; - __m128i byte2mask = _mm_set1_epi32(0x00FF0000); - __m128i byte3mask = _mm_set1_epi32(0x0000FF00); - uint64_t number = 0; - const unsigned int halfPoints = num_points / 2; - for(;number < halfPoints; number++){ - // Load the 32t values, increment inputPtr later since we're doing it in-place. - input = _mm_load_si128((__m128i*)inputPtr); - - // Do the four shifts - byte1 = _mm_slli_epi32(input, 24); - byte2 = _mm_slli_epi32(input, 8); - byte3 = _mm_srli_epi32(input, 8); - byte4 = _mm_srli_epi32(input, 24); - // Or bytes together - output = _mm_or_si128(byte1, byte4); - byte2 = _mm_and_si128(byte2, byte2mask); - output = _mm_or_si128(output, byte2); - byte3 = _mm_and_si128(byte3, byte3mask); - output = _mm_or_si128(output, byte3); - - // Reorder the two words - output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)); - - // Store the results - _mm_store_si128((__m128i*)inputPtr, output); - inputPtr += 4; - } - - // Byteswap any remaining points: - number = halfPoints*2; - for(; number < num_points; number++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Byteswaps (in-place) an aligned vector of int64_t's. - \param intsToSwap The vector of data to byte swap - \param numDataPoints The number of data points -*/ -static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){ - uint32_t* inputPtr = (uint32_t*)intsToSwap; - unsigned int point; - for(point = 0; point < num_points; point++){ - uint32_t output1 = *inputPtr; - uint32_t output2 = inputPtr[1]; - - output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000)); - - output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000)); - - *inputPtr++ = output2; - *inputPtr++ = output1; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */ diff --git a/volk/include/volk/volk_64u_popcnt_a16.h b/volk/include/volk/volk_64u_popcnt_a16.h new file mode 100644 index 000000000..59511dc29 --- /dev/null +++ b/volk/include/volk/volk_64u_popcnt_a16.h @@ -0,0 +1,50 @@ +#ifndef INCLUDED_volk_64u_popcnt_a16_H +#define INCLUDED_volk_64u_popcnt_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_64u_popcnt_a16_generic(uint64_t* ret, const uint64_t value) { + + const uint32_t* valueVector = (const uint32_t*)&value; + + // This is faster than a lookup table + uint32_t retVal = valueVector[0]; + + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + uint64_t retVal64 = retVal; + + retVal = valueVector[1]; + retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); + retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); + retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; + retVal = (retVal + (retVal >> 8)); + retVal = (retVal + (retVal >> 16)) & 0x0000003F; + retVal64 += retVal; + + *ret = retVal64; + +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE4_2 && LV_HAVE_64 + +#include + +static inline void volk_64u_popcnt_a16_sse4_2(uint64_t* ret, const uint64_t value) { + *ret = _mm_popcnt_u64(value); + +} + +#endif /*LV_HAVE_SSE4_2*/ + +#endif /*INCLUDED_volk_64u_popcnt_a16_H*/ diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h deleted file mode 100644 index 4d62f9375..000000000 --- a/volk/include/volk/volk_64u_popcnt_aligned16.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H -#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) { - - const uint32_t* valueVector = (const uint32_t*)&value; - - // This is faster than a lookup table - uint32_t retVal = valueVector[0]; - - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - uint64_t retVal64 = retVal; - - retVal = valueVector[1]; - retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); - retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); - retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F; - retVal = (retVal + (retVal >> 8)); - retVal = (retVal + (retVal >> 16)) & 0x0000003F; - retVal64 += retVal; - - *ret = retVal64; - -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE4_2 && LV_HAVE_64 - -#include - -static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) { - *ret = _mm_popcnt_u64(value); - -} - -#endif /*LV_HAVE_SSE4_2*/ - -#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/ diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h new file mode 100644 index 000000000..38efdb6a3 --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_a16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_volk_8s_convert_16s_a16_H +#define INCLUDED_volk_8s_convert_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h deleted file mode 100644 index c52c64eae..000000000 --- a/volk/include/volk/volk_8s_convert_16s_aligned16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_16s_aligned16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8s_convert_16s_aligned16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8s_convert_16s_aligned16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h new file mode 100644 index 000000000..a726bfb5e --- /dev/null +++ b/volk/include/volk/volk_8s_convert_16s_ua16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_8s_convert_16s_ua16_H +#define INCLUDED_volk_8s_convert_16s_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h deleted file mode 100644 index 05b916cea..000000000 --- a/volk/include/volk/volk_8s_convert_16s_unaligned16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h deleted file mode 100644 index 700a0fa42..000000000 --- a/volk/include/volk/volk_8s_convert_32f_aligned16.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_32f_aligned16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8s_convert_32f_aligned16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8s_convert_32f_aligned16_orc_impl(outputVector, inputVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h deleted file mode 100644 index 8019aac9a..000000000 --- a/volk/include/volk/volk_8s_convert_32f_unaligned16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H -#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h new file mode 100644 index 000000000..45185ac2e --- /dev/null +++ b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h @@ -0,0 +1,105 @@ +#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H +#define INCLUDED_volk_8s_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h new file mode 100644 index 000000000..310824580 --- /dev/null +++ b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H +#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h new file mode 100644 index 000000000..eae1185ec --- /dev/null +++ b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H +#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */ diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h new file mode 100644 index 000000000..621276b08 --- /dev/null +++ b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h new file mode 100644 index 000000000..6a35e969d --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H +#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h deleted file mode 100644 index 38eaa49ea..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h deleted file mode 100644 index d0c118965..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h new file mode 100644 index 000000000..67ffebd99 --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H +#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index d0cb49494..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index c849448ea..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h new file mode 100644 index 000000000..ecffc092e --- /dev/null +++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index d84d64568..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h deleted file mode 100644 index 470a67539..000000000 --- a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H -#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h deleted file mode 100644 index 52b444cf7..000000000 --- a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H -#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h new file mode 100644 index 000000000..cedbf202c --- /dev/null +++ b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H +#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..902795131 --- /dev/null +++ b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index 9fded9a3e..fc1ec10ef 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -55,7 +55,7 @@ functions = []; for line in mfile: - subline = re.search(".*(aligned).*", line); + subline = re.search(".*(a16).*", line); if subline: subsubline = re.search("(?<=volk_).*", subline.group(0)); if subsubline: @@ -70,11 +70,10 @@ datatypes = set(datatypes); for line in mfile: for dt in datatypes: if dt in line: - subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line); + subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line); if subline: subsubline = re.search(".+(?=\.h)", subline.group(0)); - functions.append(subsubline.group(0)); archs = []; diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 5c995148a..f609f5bf9 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -184,7 +184,8 @@ testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN testqa_LDFLAGS = -lboost_unit_test_framework testqa_LDADD = \ libvolk.la \ - libvolk_runtime.la + libvolk_runtime.la \ + ../orc/libvolk_orc.la distclean-local: diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index fa21db487..a8c00c143 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -110,7 +110,11 @@ static std::vector get_arch_list(const int archs[]) { } static bool is_valid_type(std::string type) { - std::vector valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f")("32s")("32u")("16sc")("16s")("16u")("8s")("8sc")("8u"); + std::vector valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f") + ("32s")("32u")("16sc")("16s") + ("16u")("8s")("8sc")("8u") + ("s32f")("s16u")("s16s")("s8u") + ("s8s"); BOOST_FOREACH(std::string this_type, valid_types) { if(type == this_type) return true; @@ -148,17 +152,11 @@ static void get_function_signature(std::vector &inputsig, pos++; } - //if there's no output sig and only one input sig, assume there are 2 inputs - //this handles conversion fn's (which have a specified output sig) and most of the rest - if(outputsig.size() == 0 && inputsig.size() == 1) { - outputsig.push_back(inputsig[0]); - inputsig.push_back(inputsig[0]); - }//if there's no explicit output sig then assume the output is the same as the first input - else if(outputsig.size() == 0) outputsig.push_back(inputsig[0]); - - assert(inputsig.size() != 0); - assert(outputsig.size() != 0); +} + +inline void run_cast_test1(volk_fn_1arg func, void *buff, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buff, vlen, arch.c_str()); } inline void run_cast_test2(volk_fn_2arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { @@ -190,26 +188,42 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, for(int i=0; i inbuffs; make_buffer_for_signature(inbuffs, inputsig, vlen); + //allocate output buffers -- one for each output for each arch + std::vector outbuffs; + BOOST_FOREACH(std::string arch, arch_list) { + make_buffer_for_signature(outbuffs, outputsig, vlen); + } + //and set the input buffers to something random for(int i=0; i outbuffs; - BOOST_FOREACH(std::string arch, arch_list) { - make_buffer_for_signature(outbuffs, outputsig, vlen); + //so let's see here. if the operation has no output sig, it operates in place, + //and we want the output buffers to be the input buffers; we want to copy the input buffer to allllll the output buffers. + if(outputsig.size() == 0) { + //make a set of output buffers according to the input signature + BOOST_FOREACH(std::string arch, arch_list) { + make_buffer_for_signature(outbuffs, inputsig, vlen); + } + //copy input buffer[0] to all the output buffers so it has something to operate on + //output buffer element size is the same as input buffer[0] + if( } + //now run the test clock_t start, end; for(int i = 0; i < arch_list.size(); i++) { start = clock(); switch(outputsig.size()+inputsig.size()) { + case 1: + run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); + break; case 2: run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); break; @@ -262,6 +276,13 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, return 1; } } + } else if(outputsig[0] == "8s" || outputsig[0] == "8u") { + for(int j=0; j #include -#include float uniform(void); void random_floats(float *buf, unsigned n); @@ -12,6 +11,7 @@ bool run_volk_tests(const int[], void(*)(), std::string, float, int, int); #define VOLK_RUN_TESTS(func, tol, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter), 0) +typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am index 066050a7c..43f38543c 100644 --- a/volk/orc/Makefile.am +++ b/volk/orc/Makefile.am @@ -25,27 +25,27 @@ lib_LTLIBRARIES = libvolk_orc.la libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS) libvolk_orc_la_SOURCES = \ -volk_8s_convert_16s_aligned16_orc_impl.orc \ -volk_8s_convert_32f_aligned16_orc_impl.orc \ -volk_16u_byteswap_aligned16_orc_impl.orc \ -volk_32s_and_aligned16_orc_impl.orc \ -volk_32s_or_aligned16_orc_impl.orc \ -volk_32f_add_aligned16_orc_impl.orc \ -volk_32f_subtract_aligned16_orc_impl.orc \ -volk_32f_divide_aligned16_orc_impl.orc \ -volk_32f_multiply_aligned16_orc_impl.orc \ -volk_32fc_multiply_aligned16_orc_impl.orc \ -volk_32fc_32f_multiply_aligned16_orc_impl.orc \ -volk_32f_sqrt_aligned16_orc_impl.orc \ -volk_32f_max_aligned16_orc_impl.orc \ -volk_32f_min_aligned16_orc_impl.orc \ -volk_32f_normalize_aligned16_orc_impl.orc \ -volk_32fc_magnitude_32f_aligned16_orc_impl.orc \ -volk_32fc_magnitude_16s_aligned16_orc_impl.orc \ -volk_16sc_magnitude_16s_aligned16_orc_impl.orc \ -volk_16sc_deinterleave_16s_aligned16_orc_impl.orc \ -volk_16sc_deinterleave_32f_aligned16_orc_impl.orc \ -volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc +volk_8s_convert_16s_a16_orc_impl.orc \ +volk_8s_s32f_convert_32f_a16_orc_impl.orc \ +volk_16u_byteswap_a16_orc_impl.orc \ +volk_32s_32s_and_32s_a16_orc_impl.orc \ +volk_32s_32s_or_32s_a16_orc_impl.orc \ +volk_32f_32f_add_32f_a16_orc_impl.orc \ +volk_32f_32f_subtract_32f_a16_orc_impl.orc \ +volk_32f_32f_divide_32f_a16_orc_impl.orc \ +volk_32f_32f_multiply_32f_a16_orc_impl.orc \ +volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc \ +volk_32fc_32f_multiply_32fc_a16_orc_impl.orc \ +volk_32f_sqrt_32f_a16_orc_impl.orc \ +volk_32f_32f_max_32f_a16_orc_impl.orc \ +volk_32f_32f_min_32f_a16_orc_impl.orc \ +volk_32f_s32f_normalize_a16_orc_impl.orc \ +volk_32fc_magnitude_32f_a16_orc_impl.orc \ +volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc \ +volk_16sc_magnitude_16s_a16_orc_impl.orc \ +volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc \ +volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc \ +volk_16sc_deinterleave_real_8s_a16_orc_impl.orc diff --git a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc new file mode 100644 index 000000000..d396a0052 --- /dev/null +++ b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_16sc_deinterleave_16s_16s_a16_orc_impl +.dest 2 idst +.dest 2 qdst +.source 4 src +splitlw qdst, idst, src diff --git a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc deleted file mode 100644 index d226064a7..000000000 --- a/volk/orc/volk_16sc_deinterleave_16s_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_16sc_deinterleave_16s_aligned16_orc_impl -.dest 2 idst -.dest 2 qdst -.source 4 src -splitlw qdst, idst, src diff --git a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc deleted file mode 100644 index dddf682ca..000000000 --- a/volk/orc/volk_16sc_deinterleave_32f_aligned16_orc_impl.orc +++ /dev/null @@ -1,12 +0,0 @@ -.function volk_16sc_deinterleave_32f_aligned16_orc_impl -.dest 4 idst -.dest 4 qdst -.source 4 src -.floatparam 4 scalar -.temp 8 iql -.temp 8 iqf - -x2 convswl iql, src -x2 convlf iqf, iql -x2 divf iqf, iqf, scalar -splitql qdst, idst, iqf diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc new file mode 100644 index 000000000..5954c406f --- /dev/null +++ b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc @@ -0,0 +1,6 @@ +.function volk_16sc_deinterleave_real_8s_a16_orc_impl +.dest 1 dst +.source 4 src +.temp 2 iw +select0lw iw, src +convhwb dst, iw diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc deleted file mode 100644 index 609750096..000000000 --- a/volk/orc/volk_16sc_deinterleave_real_8s_aligned16_orc_impl.orc +++ /dev/null @@ -1,6 +0,0 @@ -.function volk_16sc_deinterleave_real_8s_aligned16_orc_impl -.dest 1 dst -.source 4 src -.temp 2 iw -select0lw iw, src -convhwb dst, iw diff --git a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc new file mode 100644 index 000000000..2a49d4ecb --- /dev/null +++ b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc @@ -0,0 +1,23 @@ +.function volk_16sc_magnitude_16s_a16_orc_impl +.source 4 src +.dest 2 dst +.floatparam 4 scalar +.temp 8 iql +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if +.temp 4 sumf +.temp 4 rootf +.temp 4 rootl + +x2 convswl iql, src +x2 convlf iqf, iql +x2 divf iqf, iqf, scalar +x2 mulf prodiqf, iqf, iqf +splitql qf, if, prodiqf +addf sumf, if, qf +sqrtf rootf, sumf +mulf rootf, rootf, scalar +convfl rootl, rootf +convlw dst, rootl diff --git a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc deleted file mode 100644 index 088f56312..000000000 --- a/volk/orc/volk_16sc_magnitude_16s_aligned16_orc_impl.orc +++ /dev/null @@ -1,23 +0,0 @@ -.function volk_16sc_magnitude_16s_aligned16_orc_impl -.source 4 src -.dest 2 dst -.floatparam 4 scalar -.temp 8 iql -.temp 8 iqf -.temp 8 prodiqf -.temp 4 qf -.temp 4 if -.temp 4 sumf -.temp 4 rootf -.temp 4 rootl - -x2 convswl iql, src -x2 convlf iqf, iql -x2 divf iqf, iqf, scalar -x2 mulf prodiqf, iqf, iqf -splitql qf, if, prodiqf -addf sumf, if, qf -sqrtf rootf, sumf -mulf rootf, rootf, scalar -convfl rootl, rootf -convlw dst, rootl diff --git a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc new file mode 100644 index 000000000..47c3d28a9 --- /dev/null +++ b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc @@ -0,0 +1,12 @@ +.function volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl +.dest 4 idst +.dest 4 qdst +.source 4 src +.floatparam 4 scalar +.temp 8 iql +.temp 8 iqf + +x2 convswl iql, src +x2 convlf iqf, iql +x2 divf iqf, iqf, scalar +splitql qdst, idst, iqf diff --git a/volk/orc/volk_16u_byteswap_a16_orc_impl.orc b/volk/orc/volk_16u_byteswap_a16_orc_impl.orc new file mode 100644 index 000000000..c1c8ee59e --- /dev/null +++ b/volk/orc/volk_16u_byteswap_a16_orc_impl.orc @@ -0,0 +1,3 @@ +.function volk_16u_byteswap_a16_orc_impl +.dest 2 dst +swapw dst, dst diff --git a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc b/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc deleted file mode 100644 index 3ffd12ec0..000000000 --- a/volk/orc/volk_16u_byteswap_aligned16_orc_impl.orc +++ /dev/null @@ -1,3 +0,0 @@ -.function volk_16u_byteswap_aligned16_orc_impl -.dest 2 dst -swapw dst, dst diff --git a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc new file mode 100644 index 000000000..e6a30cf01 --- /dev/null +++ b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_add_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +addf dst, src1, src2 diff --git a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc new file mode 100644 index 000000000..0bdcd0010 --- /dev/null +++ b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_divide_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +divf dst, src1, src2 diff --git a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc new file mode 100644 index 000000000..9584e6634 --- /dev/null +++ b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_max_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +maxf dst, src1, src2 diff --git a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc new file mode 100644 index 000000000..47b9c05db --- /dev/null +++ b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_min_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +minf dst, src1, src2 diff --git a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc new file mode 100644 index 000000000..e5a049c16 --- /dev/null +++ b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_multiply_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +mulf dst, src1, src2 diff --git a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc new file mode 100644 index 000000000..2ab42d5f6 --- /dev/null +++ b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_32f_subtract_32f_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +subf dst, src1, src2 diff --git a/volk/orc/volk_32f_add_aligned16_orc_impl.orc b/volk/orc/volk_32f_add_aligned16_orc_impl.orc deleted file mode 100644 index 20e000f68..000000000 --- a/volk/orc/volk_32f_add_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_add_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -addf dst, src1, src2 diff --git a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc b/volk/orc/volk_32f_divide_aligned16_orc_impl.orc deleted file mode 100644 index 870843f2a..000000000 --- a/volk/orc/volk_32f_divide_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_divide_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -divf dst, src1, src2 diff --git a/volk/orc/volk_32f_max_aligned16_orc_impl.orc b/volk/orc/volk_32f_max_aligned16_orc_impl.orc deleted file mode 100644 index 97f48ba4a..000000000 --- a/volk/orc/volk_32f_max_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_max_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -maxf dst, src1, src2 diff --git a/volk/orc/volk_32f_min_aligned16_orc_impl.orc b/volk/orc/volk_32f_min_aligned16_orc_impl.orc deleted file mode 100644 index a597933de..000000000 --- a/volk/orc/volk_32f_min_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_min_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -minf dst, src1, src2 diff --git a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc deleted file mode 100644 index 23619af4e..000000000 --- a/volk/orc/volk_32f_multiply_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_multiply_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -mulf dst, src1, src2 diff --git a/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc b/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc deleted file mode 100644 index 84d965e7f..000000000 --- a/volk/orc/volk_32f_normalize_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_normalize_aligned16_orc_impl -.source 4 src1 -.floatparam 4 invscalar -.dest 4 dst -mulf dst, src1, invscalar diff --git a/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc b/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc new file mode 100644 index 000000000..acd319b16 --- /dev/null +++ b/volk/orc/volk_32f_s32f_normalize_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32f_s32f_normalize_a16_orc_impl +.source 4 src1 +.floatparam 4 invscalar +.dest 4 dst +mulf dst, src1, invscalar diff --git a/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc b/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc new file mode 100644 index 000000000..ae5680f15 --- /dev/null +++ b/volk/orc/volk_32f_sqrt_32f_a16_orc_impl.orc @@ -0,0 +1,4 @@ +.function volk_32f_sqrt_32f_a16_orc_impl +.source 4 src +.dest 4 dst +sqrtf dst, src diff --git a/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc b/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc deleted file mode 100644 index 0983271db..000000000 --- a/volk/orc/volk_32f_sqrt_aligned16_orc_impl.orc +++ /dev/null @@ -1,4 +0,0 @@ -.function volk_32f_sqrt_aligned16_orc_impl -.source 4 src -.dest 4 dst -sqrtf dst, src diff --git a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc b/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc deleted file mode 100644 index 17dbcad46..000000000 --- a/volk/orc/volk_32f_subtract_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32f_subtract_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -subf dst, src1, src2 diff --git a/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc new file mode 100644 index 000000000..455293cff --- /dev/null +++ b/volk/orc/volk_32fc_32f_multiply_32fc_a16_orc_impl.orc @@ -0,0 +1,7 @@ +.function volk_32fc_32f_multiply_32fc_a16_orc_impl +.source 8 src1 +.source 4 src2 +.dest 8 dst +.temp 8 tmp +mergelq tmp, src2, src2 +x2 mulf dst, src1, tmp diff --git a/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc deleted file mode 100644 index 27c059852..000000000 --- a/volk/orc/volk_32fc_32f_multiply_aligned16_orc_impl.orc +++ /dev/null @@ -1,7 +0,0 @@ -.function volk_32fc_32f_multiply_aligned16_orc_impl -.source 8 src1 -.source 4 src2 -.dest 8 dst -.temp 8 tmp -mergelq tmp, src2, src2 -x2 mulf dst, src1, tmp diff --git a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc new file mode 100644 index 000000000..ed928b90f --- /dev/null +++ b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc @@ -0,0 +1,6 @@ +.function volk_32fc_32fc_multiply_32fc_a16_orc_impl +.source 8 src1 +.source 8 src2 +.dest 8 dst +.temp 8 tmp +x2 mulf dst, src1, src2 diff --git a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc deleted file mode 100644 index 15f8fdff0..000000000 --- a/volk/orc/volk_32fc_magnitude_16s_aligned16_orc_impl.orc +++ /dev/null @@ -1,23 +0,0 @@ -.function volk_32fc_magnitude_16s_aligned16_orc_impl -.source 8 src -.dest 2 dst -.floatparam 4 scalar -.temp 8 iqf -.temp 8 prodiqf -.temp 4 qf -.temp 4 if -.temp 4 sumf -.temp 4 rootf -.temp 4 rootl -.temp 4 maskl - -x2 mulf prodiqf, src, src -splitql qf, if, prodiqf -addf sumf, if, qf -sqrtf rootf, sumf -mulf rootf, rootf, scalar -cmpltf maskl, scalar, rootf -andl maskl, maskl, 0x80000000 -orl rootf, rootf, maskl -convfl rootl, rootf -convssslw dst, rootl diff --git a/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc new file mode 100644 index 000000000..c5e2e57f1 --- /dev/null +++ b/volk/orc/volk_32fc_magnitude_32f_a16_orc_impl.orc @@ -0,0 +1,13 @@ +.function volk_32fc_magnitude_32f_a16_orc_impl +.source 8 src +.dest 4 dst +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if +.temp 4 sumf + +x2 mulf prodiqf, src, src +splitql qf, if, prodiqf +addf sumf, if, qf +sqrtf dst, sumf diff --git a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc deleted file mode 100644 index 47a10531d..000000000 --- a/volk/orc/volk_32fc_magnitude_32f_aligned16_orc_impl.orc +++ /dev/null @@ -1,13 +0,0 @@ -.function volk_32fc_magnitude_32f_aligned16_orc_impl -.source 8 src -.dest 4 dst -.temp 8 iqf -.temp 8 prodiqf -.temp 4 qf -.temp 4 if -.temp 4 sumf - -x2 mulf prodiqf, src, src -splitql qf, if, prodiqf -addf sumf, if, qf -sqrtf dst, sumf diff --git a/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc b/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc deleted file mode 100644 index ffe9cc3ef..000000000 --- a/volk/orc/volk_32fc_multiply_aligned16_orc_impl.orc +++ /dev/null @@ -1,6 +0,0 @@ -.function volk_32fc_multiply_aligned16_orc_impl -.source 8 src1 -.source 8 src2 -.dest 8 dst -.temp 8 tmp -x2 mulf dst, src1, src2 diff --git a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc new file mode 100644 index 000000000..cccda8a0f --- /dev/null +++ b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc @@ -0,0 +1,23 @@ +.function volk_32fc_s32f_magnitude_16s_a16_orc_impl +.source 8 src +.dest 2 dst +.floatparam 4 scalar +.temp 8 iqf +.temp 8 prodiqf +.temp 4 qf +.temp 4 if +.temp 4 sumf +.temp 4 rootf +.temp 4 rootl +.temp 4 maskl + +x2 mulf prodiqf, src, src +splitql qf, if, prodiqf +addf sumf, if, qf +sqrtf rootf, sumf +mulf rootf, rootf, scalar +cmpltf maskl, scalar, rootf +andl maskl, maskl, 0x80000000 +orl rootf, rootf, maskl +convfl rootl, rootf +convssslw dst, rootl diff --git a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc new file mode 100644 index 000000000..bff3af875 --- /dev/null +++ b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32s_32s_and_32s_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +andl dst, src1, src2 diff --git a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc new file mode 100644 index 000000000..b6961f79e --- /dev/null +++ b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_32s_32s_or_32s_a16_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +orl dst, src1, src2 diff --git a/volk/orc/volk_32s_and_aligned16_orc_impl.orc b/volk/orc/volk_32s_and_aligned16_orc_impl.orc deleted file mode 100644 index 9d3c7b733..000000000 --- a/volk/orc/volk_32s_and_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32s_and_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -andl dst, src1, src2 diff --git a/volk/orc/volk_32s_or_aligned16_orc_impl.orc b/volk/orc/volk_32s_or_aligned16_orc_impl.orc deleted file mode 100644 index 6d2a3859a..000000000 --- a/volk/orc/volk_32s_or_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_32s_or_aligned16_orc_impl -.dest 4 dst -.source 4 src1 -.source 4 src2 -orl dst, src1, src2 diff --git a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc new file mode 100644 index 000000000..a55c7f723 --- /dev/null +++ b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc @@ -0,0 +1,5 @@ +.function volk_8s_convert_16s_a16_orc_impl +.source 1 src +.dest 2 dst +convsbw dst, src +shlw dst, dst, 8 diff --git a/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc deleted file mode 100644 index a089877d1..000000000 --- a/volk/orc/volk_8s_convert_16s_aligned16_orc_impl.orc +++ /dev/null @@ -1,5 +0,0 @@ -.function volk_8s_convert_16s_aligned16_orc_impl -.source 1 src -.dest 2 dst -convsbw dst, src -shlw dst, dst, 8 diff --git a/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc b/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc deleted file mode 100644 index 91a0084d7..000000000 --- a/volk/orc/volk_8s_convert_32f_aligned16_orc_impl.orc +++ /dev/null @@ -1,9 +0,0 @@ -.function volk_8s_convert_32f_aligned16_orc_impl -.source 2 src -.dest 4 dst -.floatparam 4 scalar -.temp 4 flsrc -.temp 4 lsrc -convswl lsrc, src -convlf flsrc, lsrc -mulf dst, flsrc, scalar diff --git a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc new file mode 100644 index 000000000..3274ab9d6 --- /dev/null +++ b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc @@ -0,0 +1,9 @@ +.function volk_8s_s32f_convert_32f_a16_orc_impl +.source 2 src +.dest 4 dst +.floatparam 4 scalar +.temp 4 flsrc +.temp 4 lsrc +convswl lsrc, src +convlf flsrc, lsrc +mulf dst, flsrc, scalar -- cgit From be1b7d9ffb90aa9c750e6c6793f00dbc8bec486d Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 19 Jan 2011 16:39:28 -0800 Subject: Volk: test suite supports scalar arguments and in-place operations --- volk/lib/qa_utils.cc | 357 +++++++++++++++++++++++++++++++-------------------- volk/lib/qa_utils.h | 15 ++- 2 files changed, 231 insertions(+), 141 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index a8c00c143..e73b70985 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -7,7 +7,8 @@ #include #include #include -//#include +#include +#include //#include #include #include @@ -24,44 +25,53 @@ void random_floats (float *buf, unsigned n) buf[i] = uniform (); } -void load_random_data(void *data, std::string sig, unsigned int n) { - if(sig == "32fc") { - random_floats((float *)data, n*2); - } else if(sig == "32f") { +void load_random_data(void *data, volk_type_t type, unsigned int n) { + if(type.is_complex) n *= 2; + if(type.is_float) { + assert(type.size == 4); //TODO: double support random_floats((float *)data, n); - } else if(sig == "32u") { - for(int i=0; i((RAND_MAX/2))) * 32768.0)); - } else if(sig == "16sc") { - for(int i=0; i((RAND_MAX/2))) * 32768.0)); - } else if(sig == "8u") { - for(int i=0; i((RAND_MAX/2)) * 256.0)); - } else if(sig == "8s") { - for(int i=0; i((RAND_MAX/2)) * 128.0)); - } else std::cout << "load_random_data(): Invalid sig: " << sig << std::endl; + } else { + float int_max = pow(2, type.size*8); + if(type.is_signed) int_max /= 2.0; + for(int i=0; i((RAND_MAX/2))) * int_max; + //man i really don't know how to do this in a more clever way, you have to cast down at some point + switch(type.size) { + case 8: + if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; + else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; + break; + case 4: + if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; + else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; + break; + case 2: + if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; + else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; + break; + case 1: + if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; + else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; + break; + default: + throw; //no shenanigans here + } + } + } } -template -t *make_aligned_buffer(unsigned int len) { - t *buf; +void *make_aligned_buffer(unsigned int len, unsigned int size) { + void *buf; int ret; - ret = posix_memalign((void**)&buf, 16, len * sizeof(t)); + ret = posix_memalign((void**)&buf, 16, len * size); assert(ret == 0); return buf; } -void make_buffer_for_signature(std::vector &buffs, std::vector inputsig, unsigned int vlen) { - BOOST_FOREACH(std::string sig, inputsig) { - if (sig=="32fc" || sig=="64f" || sig=="64u") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="32f" || sig=="32u" || sig=="32s" || sig=="16sc") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="16s" || sig=="16u" || sig=="8sc") buffs.push_back((void *) make_aligned_buffer(vlen)); - else if(sig=="8s" || sig=="8u") buffs.push_back((void *) make_aligned_buffer(vlen)); - else std::cout << "Invalid type: " << sig << std::endl; +void make_buffer_for_signature(std::vector &buffs, std::vector inputsig, unsigned int vlen) { + BOOST_FOREACH(volk_type_t sig, inputsig) { + if(!sig.is_scalar) //we don't make buffers for scalars + buffs.push_back(make_aligned_buffer(vlen, sig.size*(sig.is_complex ? 2 : 1))); } } @@ -109,22 +119,56 @@ static std::vector get_arch_list(const int archs[]) { return archlist; } -static bool is_valid_type(std::string type) { - std::vector valid_types = boost::assign::list_of("64f")("64u")("32fc")("32f") - ("32s")("32u")("16sc")("16s") - ("16u")("8s")("8sc")("8u") - ("s32f")("s16u")("s16s")("s8u") - ("s8s"); +volk_type_t volk_type_from_string(std::string name) { + volk_type_t type; + type.is_float = false; + type.is_scalar = false; + type.is_complex = false; + type.is_signed = false; + type.size = 0; + type.str = name; + + assert(name.size() > 1); - BOOST_FOREACH(std::string this_type, valid_types) { - if(type == this_type) return true; + //is it a scalar? + if(name[0] == 's') { + type.is_scalar = true; + name = name.substr(1, name.size()-1); + } + + //get the data size + int last_size_pos = name.find_last_of("0123456789"); + if(last_size_pos < 0) throw 0; + //will throw if malformed + int size = boost::lexical_cast(name.substr(0, last_size_pos+1)); + + assert(((size % 8) == 0) && (size <= 64) && (size != 0)); + type.size = size/8; //in bytes + + for(int i=last_size_pos+1; i < name.size(); i++) { + switch (name[i]) { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } } - return false; -} + return type; +} -static void get_function_signature(std::vector &inputsig, - std::vector &outputsig, +static void get_signatures_from_name(std::vector &inputsig, + std::vector &outputsig, std::string name) { boost::char_separator sep("_"); boost::tokenizer > tok(name, sep); @@ -133,25 +177,38 @@ static void get_function_signature(std::vector &inputsig, toked.assign(tok.begin(), tok.end()); assert(toked[0] == "volk"); - - inputsig.push_back(toked[1]); //mandatory - int pos = 2; - bool valid_type = true; - while(valid_type && pos < toked.size()) { - if(is_valid_type(toked[pos])) inputsig.push_back(toked[pos]); - else valid_type = false; - pos++; - } - while(!valid_type && pos < toked.size()) { - if(is_valid_type(toked[pos])) valid_type = true; - else pos++; - } - while(valid_type && pos < toked.size()) { - if(is_valid_type(toked[pos])) outputsig.push_back(toked[pos]); - else valid_type = false; - pos++; + toked.erase(toked.begin()); + + //ok. we're assuming a string in the form + //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) + + enum { SIDE_INPUT, SIDE_OUTPUT } side = SIDE_INPUT; + std::string fn_name; + volk_type_t type; + BOOST_FOREACH(std::string token, toked) { + try { + type = volk_type_from_string(token); + if(side == SIDE_INPUT) inputsig.push_back(type); + else outputsig.push_back(type); + } catch (...){ + if(token[0] == 'x') { //it's a multiplier + if(side == SIDE_INPUT) assert(inputsig.size() > 0); + else assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid + for(int i=1; i while(iter--) func(outbuff, inbuffs[0], inbuffs[1], inbuffs[2], vlen, arch.c_str()); } +inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, void *buff, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buff, scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vector &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str()); +} + +template +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { + for(int i=0; i tol) return 1; + } + return 0; +} + +template +bool icompare(t *in1, t *in2, unsigned int vlen) { + for(int i=0; i arch_list = get_arch_list(archs); - BOOST_FOREACH(std::string arch, arch_list) { - std::cout << "Found an arch: " << arch << std::endl; - } - //now we have to get a function signature by parsing the name - std::vector inputsig, outputsig; - get_function_signature(inputsig, outputsig, name); - - for(int i=0; i inbuffs; - make_buffer_for_signature(inbuffs, inputsig, vlen); + std::vector inputsig, outputsig; + get_signatures_from_name(inputsig, outputsig, name); - //allocate output buffers -- one for each output for each arch - std::vector outbuffs; - BOOST_FOREACH(std::string arch, arch_list) { - make_buffer_for_signature(outbuffs, outputsig, vlen); - } - - //and set the input buffers to something random + std::vector inputsc, outputsc; for(int i=0; i inbuffs, outbuffs; - //so let's see here. if the operation has no output sig, it operates in place, - //and we want the output buffers to be the input buffers; we want to copy the input buffer to allllll the output buffers. - if(outputsig.size() == 0) { - //make a set of output buffers according to the input signature - BOOST_FOREACH(std::string arch, arch_list) { + if(outputsig.size() == 0) { //we're operating in place... + //assert(inputsig.size() == 1); //we only support 0 output 1 input right now... + make_buffer_for_signature(inbuffs, inputsig, vlen); //let's make an input buffer + load_random_data(inbuffs[0], inputsig[0], vlen); //and load it with random data + BOOST_FOREACH(std::string arch, arch_list) { //then copy the same random data to each output buffer make_buffer_for_signature(outbuffs, inputsig, vlen); + memcpy(outbuffs.back(), inbuffs[0], vlen*inputsig[0].size*(inputsig[0].is_complex?2:1)); + } + } else { + make_buffer_for_signature(inbuffs, inputsig, vlen); + BOOST_FOREACH(std::string arch, arch_list) { + make_buffer_for_signature(outbuffs, outputsig, vlen); + } + + //and set the input buffers to something random + for(int i=0; i1 scalars"; break; case 2: - run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + if(inputsc.size() == 0) { + run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 1000.0, vlen, iter, arch_list[i]); + } else throw "unsupported 2 arg function >1 scalars"; break; case 3: run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); @@ -234,69 +328,52 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); break; default: + throw "no function handler for this signature"; break; } + end = clock(); std::cout << arch_list[i] << " completed in " << (double)(end-start)/(double)CLOCKS_PER_SEC << "s" << std::endl; } - //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... int generic_offset; for(int i=0; i tol) { - std::cout << "Generic: " << ((float *)(outbuffs[generic_offset]))[j] << " " << arch_list[i] << ": " << ((float *)(outbuffs[i]))[j] << std::endl; - return 1; - } - } - } else if(outputsig[0] == "32f") { - for(int j=0; j tol) { - std::cout << "Generic: " << ((float *)(outbuffs[generic_offset]))[j] << " " << arch_list[i] << ": " << ((float *)(outbuffs[i]))[j] << std::endl; - return 1; - } - } - } else if(outputsig[0] == "32u" || outputsig[0] == "32s" || outputsig[0] == "16sc") { - for(int j=0; j #include +struct volk_type_t { + bool is_float; + bool is_scalar; + bool is_signed; + bool is_complex; + int size; + std::string str; +}; + +volk_type_t volk_type_from_string(std::string); + float uniform(void); void random_floats(float *buf, unsigned n); @@ -11,9 +22,11 @@ bool run_volk_tests(const int[], void(*)(), std::string, float, int, int); #define VOLK_RUN_TESTS(func, tol, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter), 0) -typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); +typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*); typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); +typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); #endif //VOLK_QA_UTILS_H -- cgit From e3600f59e76c3dc08aedfd77629b7c5c48df86af Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 20 Jan 2011 16:30:09 -0800 Subject: volk: renamed all files. added all tests. some test things are still broken. --- volk/include/volk/Makefile.am | 128 +++--- volk/include/volk/make_c.py | 2 +- volk/include/volk/volk_16i_branch_4_state_8_a16.h | 194 +++++++++ volk/include/volk/volk_16i_convert_8i_a16.h | 69 +++ volk/include/volk/volk_16i_convert_8i_u.h | 71 ++++ volk/include/volk/volk_16i_max_star_16i_a16.h | 108 +++++ .../volk/volk_16i_max_star_horizontal_16i_a16.h | 130 ++++++ .../volk/volk_16i_permute_and_scalar_add_a16.h | 139 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_a16.h | 119 ++++++ volk/include/volk/volk_16i_s32f_convert_32f_u.h | 122 ++++++ .../volk/volk_16i_x4_quad_max_star_16i_a16.h | 191 +++++++++ .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ++++++ .../volk/volk_16ic_deinterleave_16i_x2_a16.h | 158 +++++++ .../volk/volk_16ic_deinterleave_real_16i_a16.h | 120 ++++++ .../volk/volk_16ic_deinterleave_real_8i_a16.h | 94 +++++ volk/include/volk/volk_16ic_magnitude_16i_a16.h | 190 +++++++++ .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h | 108 +++++ .../volk_16ic_s32f_deinterleave_real_32f_a16.h | 125 ++++++ .../volk/volk_16ic_s32f_magnitude_32f_a16.h | 179 ++++++++ volk/include/volk/volk_16s_add_quad_a16.h | 136 ------ volk/include/volk/volk_16s_branch_4_state_8_a16.h | 194 --------- volk/include/volk/volk_16s_convert_8s_a16.h | 69 --- volk/include/volk/volk_16s_convert_8s_ua16.h | 71 ---- volk/include/volk/volk_16s_max_star_16s_a16.h | 108 ----- .../volk/volk_16s_max_star_horizontal_16s_a16.h | 130 ------ .../volk/volk_16s_permute_and_scalar_add_a16.h | 139 ------ volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 --------- volk/include/volk/volk_16s_s32f_convert_32f_a16.h | 119 ------ volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ------ .../volk/volk_16sc_deinterleave_16s_16s_a16.h | 158 ------- .../volk/volk_16sc_deinterleave_real_16s_a16.h | 120 ------ .../volk/volk_16sc_deinterleave_real_8s_a16.h | 94 ----- volk/include/volk/volk_16sc_magnitude_16s_a16.h | 190 --------- .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 ----- .../volk_16sc_s32f_deinterleave_real_32f_a16.h | 125 ------ .../volk/volk_16sc_s32f_magnitude_32f_a16.h | 179 -------- .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h | 151 ------- volk/include/volk/volk_32f_32f_add_32f_a16.h | 81 ---- volk/include/volk/volk_32f_32f_divide_32f_a16.h | 82 ---- volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h | 184 -------- volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 -------- .../volk/volk_32f_32f_interleave_32fc_a16.h | 75 ---- volk/include/volk/volk_32f_32f_max_32f_a16.h | 85 ---- volk/include/volk/volk_32f_32f_min_32f_a16.h | 85 ---- volk/include/volk/volk_32f_32f_multiply_32f_a16.h | 81 ---- .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h | 155 ------- volk/include/volk/volk_32f_32f_subtract_32f_a16.h | 81 ---- volk/include/volk/volk_32f_convert_64f_u.h | 70 +++ volk/include/volk/volk_32f_convert_64f_ua16.h | 70 --- volk/include/volk/volk_32f_s32f_convert_16i_a16.h | 110 +++++ volk/include/volk/volk_32f_s32f_convert_16i_u.h | 113 +++++ volk/include/volk/volk_32f_s32f_convert_16s_a16.h | 110 ----- volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 ----- volk/include/volk/volk_32f_s32f_convert_32i_a16.h | 106 +++++ volk/include/volk/volk_32f_s32f_convert_32i_u.h | 109 +++++ volk/include/volk/volk_32f_s32f_convert_32s_a16.h | 106 ----- volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 ----- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 117 ++++++ volk/include/volk/volk_32f_s32f_convert_8i_u.h | 120 ++++++ volk/include/volk/volk_32f_s32f_convert_8s_a16.h | 117 ------ volk/include/volk/volk_32f_s32f_convert_8s_ua16.h | 120 ------ .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h | 169 -------- .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h | 169 ++++++++ volk/include/volk/volk_32f_x2_add_32f_a16.h | 81 ++++ volk/include/volk/volk_32f_x2_divide_32f_a16.h | 82 ++++ volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h | 184 ++++++++ volk/include/volk/volk_32f_x2_dot_prod_32f_u.h | 184 ++++++++ .../include/volk/volk_32f_x2_interleave_32fc_a16.h | 75 ++++ volk/include/volk/volk_32f_x2_max_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_x2_min_32f_a16.h | 85 ++++ volk/include/volk/volk_32f_x2_multiply_32f_a16.h | 81 ++++ .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h | 155 +++++++ volk/include/volk/volk_32f_x2_subtract_32f_a16.h | 81 ++++ .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 +++++++ .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h | 344 --------------- .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h | 468 --------------------- .../volk/volk_32fc_32fc_multiply_32fc_a16.h | 95 ----- ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------ .../volk/volk_32fc_32fc_square_dist_32f_a16.h | 112 ----- .../volk/volk_32fc_deinterleave_32f_32f_a16.h | 75 ---- .../volk/volk_32fc_deinterleave_32f_x2_a16.h | 75 ++++ .../volk/volk_32fc_deinterleave_64f_64f_a16.h | 78 ---- .../volk/volk_32fc_deinterleave_64f_x2_a16.h | 78 ++++ .../volk/volk_32fc_deinterleave_real_16i_a16.h | 80 ++++ .../volk/volk_32fc_deinterleave_real_16s_a16.h | 80 ---- .../volk/volk_32fc_s32f_magnitude_16i_a16.h | 158 +++++++ .../volk/volk_32fc_s32f_magnitude_16s_a16.h | 158 ------- ...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ------ ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ++++++ .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h | 344 +++++++++++++++ volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++ volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 95 +++++ ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++ .../volk/volk_32fc_x2_square_dist_32f_a16.h | 112 +++++ volk/include/volk/volk_32i_s32f_convert_32f_a16.h | 73 ++++ volk/include/volk/volk_32i_s32f_convert_32f_u.h | 75 ++++ volk/include/volk/volk_32i_x2_and_32i_a16.h | 81 ++++ volk/include/volk/volk_32i_x2_or_32i_a16.h | 81 ++++ volk/include/volk/volk_32s_32s_and_32s_a16.h | 81 ---- volk/include/volk/volk_32s_32s_or_32s_a16.h | 81 ---- volk/include/volk/volk_32s_s32f_convert_32f_a16.h | 73 ---- volk/include/volk/volk_32s_s32f_convert_32f_ua16.h | 75 ---- volk/include/volk/volk_64f_64f_max_64f_a16.h | 71 ---- volk/include/volk/volk_64f_64f_min_64f_a16.h | 71 ---- volk/include/volk/volk_64f_convert_32f_u.h | 67 +++ volk/include/volk/volk_64f_convert_32f_ua16.h | 67 --- volk/include/volk/volk_64f_x2_max_64f_a16.h | 71 ++++ volk/include/volk/volk_64f_x2_min_64f_a16.h | 71 ++++ volk/include/volk/volk_8i_convert_16i_a16.h | 83 ++++ volk/include/volk/volk_8i_convert_16i_u.h | 73 ++++ volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 105 +++++ volk/include/volk/volk_8i_s32f_convert_32f_u.h | 94 +++++ .../volk/volk_8ic_deinterleave_16i_x2_a16.h | 77 ++++ .../volk/volk_8ic_deinterleave_real_16i_a16.h | 66 +++ .../volk/volk_8ic_deinterleave_real_8i_a16.h | 67 +++ .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h | 164 ++++++++ .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 133 ++++++ .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 102 +++++ .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++ volk/include/volk/volk_8s_convert_16s_a16.h | 83 ---- volk/include/volk/volk_8s_convert_16s_ua16.h | 73 ---- volk/include/volk/volk_8s_s32f_convert_32f_a16.h | 105 ----- volk/include/volk/volk_8s_s32f_convert_32f_ua16.h | 94 ----- .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h | 102 ----- ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ------ .../volk/volk_8sc_deinterleave_16s_16s_a16.h | 77 ---- .../volk/volk_8sc_deinterleave_real_16s_a16.h | 66 --- .../volk/volk_8sc_deinterleave_real_8s_a16.h | 67 --- .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h | 164 -------- .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ------ volk/include/volk/volk_register.py | 4 +- volk/lib/qa_utils.cc | 101 +++-- volk/lib/qa_utils.h | 1 + volk/orc/Makefile.am | 33 +- ...k_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc | 12 + .../volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc | 5 + ...volk_16ic_deinterleave_real_8i_a16_orc_impl.orc | 6 + volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc | 23 + ...volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc | 5 - ...volk_16sc_deinterleave_real_8s_a16_orc_impl.orc | 6 - volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc | 23 - .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc | 2 +- ...16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc | 12 - volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc | 5 - volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc | 5 - volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc | 5 - volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc | 5 - .../orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc | 5 - .../orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc | 5 - volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc | 5 + volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc | 5 + volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc | 5 + volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc | 5 + volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc | 5 + volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc | 5 + .../volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc | 6 - .../volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc | 23 + .../volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc | 23 - .../volk_32fc_x2_multiply_32fc_a16_orc_impl.orc | 6 + volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc | 5 + volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc | 5 + volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc | 5 - volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc | 5 - volk/orc/volk_8i_convert_16i_a16_orc_impl.orc | 5 + volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc | 9 + volk/orc/volk_8s_convert_16s_a16_orc_impl.orc | 5 - volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc | 9 - 167 files changed, 7962 insertions(+), 7939 deletions(-) create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h create mode 100644 volk/include/volk/volk_16i_convert_8i_u.h create mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_16s_add_quad_a16.h delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h delete mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h delete mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h delete mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h delete mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_convert_64f_u.h delete mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_u.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h create mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h delete mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h delete mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h delete mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h create mode 100644 volk/include/volk/volk_64f_convert_32f_u.h delete mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h create mode 100644 volk/include/volk/volk_8i_convert_16i_u.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_u.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h delete mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h delete mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h delete mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h create mode 100644 volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc create mode 100644 volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc create mode 100644 volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc create mode 100644 volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc create mode 100644 volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc create mode 100644 volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc create mode 100644 volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc create mode 100644 volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc create mode 100644 volk/orc/volk_8i_convert_16i_a16_orc_impl.orc create mode 100644 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc delete mode 100644 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc delete mode 100644 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index aef1d7ba8..43c8ae9df 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -41,93 +41,93 @@ volkinclude_HEADERS = \ volk.h \ volk_cpu.h \ volk_environment_init.h \ - volk_16s_add_quad_a16.h \ - volk_16s_branch_4_state_8_a16.h \ - volk_16sc_deinterleave_16s_16s_a16.h \ - volk_16sc_s32f_deinterleave_32f_32f_a16.h \ - volk_16sc_deinterleave_real_16s_a16.h \ - volk_16sc_s32f_deinterleave_real_32f_a16.h \ - volk_16sc_deinterleave_real_8s_a16.h \ - volk_16sc_magnitude_16s_a16.h \ - volk_16sc_s32f_magnitude_32f_a16.h \ - volk_16s_s32f_convert_32f_a16.h \ - volk_16s_s32f_convert_32f_ua16.h \ - volk_16s_convert_8s_a16.h \ - volk_16s_convert_8s_ua16.h \ - volk_16s_max_star_16s_a16.h \ - volk_16s_max_star_horizontal_16s_a16.h \ - volk_16s_permute_and_scalar_add_a16.h \ - volk_16s_quad_max_star_16s_a16.h \ + volk_16i_x5_add_quad_16i_x4_a16.h \ + volk_16i_branch_4_state_8_a16.h \ + volk_16ic_deinterleave_16i_x2_a16.h \ + volk_16ic_s32f_deinterleave_32f_x2_a16.h \ + volk_16ic_deinterleave_real_16i_a16.h \ + volk_16ic_s32f_deinterleave_real_32f_a16.h \ + volk_16ic_deinterleave_real_8i_a16.h \ + volk_16ic_magnitude_16i_a16.h \ + volk_16ic_s32f_magnitude_32f_a16.h \ + volk_16i_s32f_convert_32f_a16.h \ + volk_16i_s32f_convert_32f_u.h \ + volk_16i_convert_8i_a16.h \ + volk_16i_convert_8i_u.h \ + volk_16i_max_star_16i_a16.h \ + volk_16i_max_star_horizontal_16i_a16.h \ + volk_16i_permute_and_scalar_add_a16.h \ + volk_16i_x4_quad_max_star_16i_a16.h \ volk_16u_byteswap_a16.h \ volk_32f_accumulator_s32f_a16.h \ - volk_32f_32f_add_32f_a16.h \ + volk_32f_x2_add_32f_a16.h \ volk_32fc_32f_multiply_32fc_a16.h \ volk_32fc_32f_power_32fc_a16.h \ volk_32f_calc_spectral_noise_floor_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ - volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \ - volk_32fc_deinterleave_32f_32f_a16.h \ - volk_32fc_deinterleave_64f_64f_a16.h \ - volk_32fc_deinterleave_real_16s_a16.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_deinterleave_32f_x2_a16.h \ + volk_32fc_deinterleave_64f_x2_a16.h \ + volk_32fc_deinterleave_real_16i_a16.h \ volk_32fc_deinterleave_real_32f_a16.h \ volk_32fc_deinterleave_real_64f_a16.h \ - volk_32fc_32fc_dot_prod_32fc_a16.h \ + volk_32fc_x2_dot_prod_32fc_a16.h \ volk_32fc_index_max_16u_a16.h \ - volk_32fc_s32f_magnitude_16s_a16.h \ + volk_32fc_s32f_magnitude_16i_a16.h \ volk_32fc_magnitude_32f_a16.h \ - volk_32fc_32fc_multiply_32fc_a16.h \ - volk_32f_s32f_convert_16s_a16.h \ - volk_32f_s32f_convert_16s_ua16.h \ - volk_32f_s32f_convert_32s_a16.h \ - volk_32f_s32f_convert_32s_ua16.h \ + volk_32fc_x2_multiply_32fc_a16.h \ + volk_32f_s32f_convert_16i_a16.h \ + volk_32f_s32f_convert_16i_u.h \ + volk_32f_s32f_convert_32i_a16.h \ + volk_32f_s32f_convert_32i_u.h \ volk_32f_convert_64f_a16.h \ - volk_32f_convert_64f_ua16.h \ - volk_32f_s32f_convert_8s_a16.h \ - volk_32f_s32f_convert_8s_ua16.h \ - volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \ + volk_32f_convert_64f_u.h \ + volk_32f_s32f_convert_8i_a16.h \ + volk_32f_s32f_convert_8i_u.h \ + volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \ volk_32fc_s32f_power_spectrum_32f_a16.h \ - volk_32fc_32fc_square_dist_32f_a16.h \ - volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \ - volk_32f_32f_divide_32f_a16.h \ - volk_32f_32f_dot_prod_32f_a16.h \ - volk_32f_32f_dot_prod_32f_ua16.h \ + volk_32fc_x2_square_dist_32f_a16.h \ + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \ + volk_32f_x2_divide_32f_a16.h \ + volk_32f_x2_dot_prod_32f_a16.h \ + volk_32f_x2_dot_prod_32f_u.h \ volk_32f_s32f_32f_fm_detect_32f_a16.h \ volk_32f_index_max_16u_a16.h \ - volk_32f_32f_s32f_interleave_16sc_a16.h \ - volk_32f_32f_interleave_32fc_a16.h \ - volk_32f_32f_max_32f_a16.h \ - volk_32f_32f_min_32f_a16.h \ - volk_32f_32f_multiply_32f_a16.h \ + volk_32f_x2_s32f_interleave_16ic_a16.h \ + volk_32f_x2_interleave_32fc_a16.h \ + volk_32f_x2_max_32f_a16.h \ + volk_32f_x2_min_32f_a16.h \ + volk_32f_x2_multiply_32f_a16.h \ volk_32f_s32f_normalize_a16.h \ volk_32f_s32f_power_32f_a16.h \ volk_32f_sqrt_32f_a16.h \ volk_32f_s32f_stddev_32f_a16.h \ - volk_32f_stddev_and_mean_32f_32f_a16.h \ - volk_32f_32f_subtract_32f_a16.h \ - volk_32f_32f_32f_sum_of_poly_32f_a16.h \ - volk_32s_32s_and_32s_a16.h \ - volk_32s_s32f_convert_32f_a16.h \ - volk_32s_s32f_convert_32f_ua16.h \ - volk_32s_32s_or_32s_a16.h \ + volk_32f_stddev_and_mean_32f_x2_a16.h \ + volk_32f_x2_subtract_32f_a16.h \ + volk_32f_x3_sum_of_poly_32f_a16.h \ + volk_32i_x2_and_32i_a16.h \ + volk_32i_s32f_convert_32f_a16.h \ + volk_32i_s32f_convert_32f_u.h \ + volk_32i_x2_or_32i_a16.h \ volk_32u_byteswap_a16.h \ volk_32u_popcnt_a16.h \ volk_64f_convert_32f_a16.h \ - volk_64f_convert_32f_ua16.h \ - volk_64f_64f_max_64f_a16.h \ - volk_64f_64f_min_64f_a16.h \ + volk_64f_convert_32f_u.h \ + volk_64f_x2_max_64f_a16.h \ + volk_64f_x2_min_64f_a16.h \ volk_64u_byteswap_a16.h \ volk_64u_popcnt_a16.h \ - volk_8sc_deinterleave_16s_16s_a16.h \ - volk_8sc_s32f_deinterleave_32f_32f_a16.h \ - volk_8sc_deinterleave_real_16s_a16.h \ - volk_8sc_s32f_deinterleave_real_32f_a16.h \ - volk_8sc_deinterleave_real_8s_a16.h \ - volk_8sc_8sc_multiply_conjugate_16sc_a16.h \ - volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \ - volk_8s_convert_16s_a16.h \ - volk_8s_convert_16s_ua16.h \ - volk_8s_s32f_convert_32f_a16.h \ - volk_8s_s32f_convert_32f_ua16.h + volk_8ic_deinterleave_16i_x2_a16.h \ + volk_8ic_s32f_deinterleave_32f_x2_a16.h \ + volk_8ic_deinterleave_real_16i_a16.h \ + volk_8ic_s32f_deinterleave_real_32f_a16.h \ + volk_8ic_deinterleave_real_8i_a16.h \ + volk_8ic_x2_multiply_conjugate_16ic_a16.h \ + volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \ + volk_8i_convert_16i_a16.h \ + volk_8i_convert_16i_u.h \ + volk_8i_s32f_convert_32f_a16.h \ + volk_8i_s32f_convert_32f_u.h VOLK_MKTABLES_SOURCES = \ $(top_srcdir)/lib/volk_rank_archs.c \ diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py index f708ba7d0..6e75067d0 100644 --- a/volk/include/volk/make_c.py +++ b/volk/include/volk/make_c.py @@ -24,7 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) : tempstring = tempstring + " }\n" tempstring = tempstring + " return 0;\n" tempstring = tempstring + "}\n" - + for i in range(len(funclist)): tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n"; diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h new file mode 100644 index 000000000..3437c1a6b --- /dev/null +++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h @@ -0,0 +1,194 @@ +#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H +#define INCLUDED_volk_16i_branch_4_state_8_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; + + __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; + + + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = 1; + + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + xmm0 = _mm_load_si128((__m128i*)permuters[0]); + xmm6 = _mm_load_si128((__m128i*)permuters[1]); + xmm8 = _mm_load_si128((__m128i*)permuters[2]); + xmm10 = _mm_load_si128((__m128i*)permuters[3]); + + for(; i < bound; ++i) { + + xmm5 = _mm_load_si128(p_src0); + + + + + + + + + + xmm0 = _mm_shuffle_epi8(xmm5, xmm0); + xmm6 = _mm_shuffle_epi8(xmm5, xmm6); + xmm8 = _mm_shuffle_epi8(xmm5, xmm8); + xmm10 = _mm_shuffle_epi8(xmm5, xmm10); + + p_src0 += 4; + + + xmm5 = _mm_add_epi16(xmm1, xmm2); + + xmm6 = _mm_add_epi16(xmm2, xmm6); + xmm8 = _mm_add_epi16(xmm1, xmm8); + + + xmm7 = _mm_load_si128(p_cntl2); + xmm9 = _mm_load_si128(p_cntl3); + + xmm0 = _mm_add_epi16(xmm5, xmm0); + + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm5 = _mm_load_si128(&p_cntl2[1]); + xmm11 = _mm_load_si128(&p_cntl3[1]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm7); + + + + xmm7 = _mm_load_si128(&p_cntl2[2]); + xmm9 = _mm_load_si128(&p_cntl3[2]); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + xmm7 = _mm_and_si128(xmm7, xmm3); + xmm9 = _mm_and_si128(xmm9, xmm4); + + xmm6 = _mm_add_epi16(xmm6, xmm5); + + + xmm5 = _mm_load_si128(&p_cntl2[3]); + xmm11 = _mm_load_si128(&p_cntl3[3]); + + xmm7 = _mm_add_epi16(xmm7, xmm9); + + xmm5 = _mm_and_si128(xmm5, xmm3); + xmm11 = _mm_and_si128(xmm11, xmm4); + + xmm8 = _mm_add_epi16(xmm8, xmm7); + + xmm5 = _mm_add_epi16(xmm5, xmm11); + + _mm_store_si128(p_target, xmm0); + _mm_store_si128(&p_target[1], xmm6); + + xmm10 = _mm_add_epi16(xmm5, xmm10); + + _mm_store_si128(&p_target[2], xmm8); + + _mm_store_si128(&p_target[3], xmm10); + + p_target += 3; + } +} + + +#endif /*LV_HAVE_SSEs*/ + +#if LV_HAVE_GENERIC +static inline void volk_16i_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { + int i = 0; + + int bound = 4; + + for(; i < bound; ++i) { + target[i* 8] = src0[((char)permuters[i][0])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8] & scalars[2]) + + (cntl3[i * 8] & scalars[3]); + target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 1] & scalars[2]) + + (cntl3[i * 8 + 1] & scalars[3]); + target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 2] & scalars[2]) + + (cntl3[i * 8 + 2] & scalars[3]); + target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 3] & scalars[2]) + + (cntl3[i * 8 + 3] & scalars[3]); + target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 4] & scalars[2]) + + (cntl3[i * 8 + 4] & scalars[3]); + target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 5] & scalars[2]) + + (cntl3[i * 8 + 5] & scalars[3]); + target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 6] & scalars[2]) + + (cntl3[i * 8 + 6] & scalars[3]); + target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] + + ((i + 1)%2 * scalars[0]) + + (((i >> 1)^1) * scalars[1]) + + (cntl2[i * 8 + 7] & scalars[2]) + + (cntl3[i * 8 + 7] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h new file mode 100644 index 000000000..73e45ad63 --- /dev/null +++ b/volk/include/volk/volk_16i_convert_8i_a16.h @@ -0,0 +1,69 @@ +#ifndef INCLUDED_volk_16i_convert_8i_a16_H +#define INCLUDED_volk_16i_convert_8i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted +*/ +static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h new file mode 100644 index 000000000..5fc792b56 --- /dev/null +++ b/volk/include/volk/volk_16i_convert_8i_u.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_16i_convert_8i_u_H +#define INCLUDED_volk_16i_convert_8i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + int8_t* outputVectorPtr = outputVector; + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal1; + __m128i inputVal2; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + + // Load the 16 values + inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; + + inputVal1 = _mm_srai_epi16(inputVal1, 8); + inputVal2 = _mm_srai_epi16(inputVal2, 8); + + ret = _mm_packs_epi16(inputVal1, inputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, ret); + + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] =(int8_t)(inputVector[number] >> 8); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the input 16 bit integer data into 8 bit integer data + \param inputVector The 16 bit input data buffer + \param outputVector The 8 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned +*/ +static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_convert_8i_u_H */ diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h new file mode 100644 index 000000000..ff57bd2a1 --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_16i_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16i_max_star_16i_a16_H +#define INCLUDED_volk_16i_max_star_16i_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { + + + + short candidate = src0[0]; + short cands[8]; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; + + + __m128i *p_src0; + + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + xmm1 = _mm_setzero_si128(); + xmm0 = _mm_setzero_si128(); + //_mm_insert_epi16(xmm0, candidate, 0); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm1); + + + for(i = 0; i < bound; ++i) { + xmm1 = _mm_load_si128(p_src0); + p_src0 += 1; + xmm2 = _mm_sub_epi16(xmm1, xmm0); + + + + + + + xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); + xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); + xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); + + xmm6 = _mm_xor_si128(xmm4, xmm5); + + xmm3 = _mm_and_si128(xmm3, xmm0); + xmm4 = _mm_and_si128(xmm6, xmm1); + + xmm0 = _mm_add_epi16(xmm3, xmm4); + + + } + + _mm_store_si128((__m128i*)cands, xmm0); + + for(i = 0; i < 8; ++i) { + candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; + } + + + + for(i = 0; i < leftovers; ++i) { + + candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; + } + + target[0] = candidate; + + + + + +} + +#endif /*LV_HAVE_SSSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short candidate = src0[0]; + for(i = 1; i < bound; ++i) { + candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; + } + target[0] = candidate; + +} + + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h new file mode 100644 index 000000000..695e08dbf --- /dev/null +++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h @@ -0,0 +1,130 @@ +#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H +#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H + + +#include +#include + + +#if LV_HAVE_SSSE3 + +#include +#include +#include + +static inline void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; + const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; + + + + volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i xmm5, xmm6, xmm7, xmm8; + + xmm4 = _mm_load_si128((__m128i*)shufmask0); + xmm5 = _mm_load_si128((__m128i*)shufmask1); + xmm6 = _mm_load_si128((__m128i*)andmask0); + xmm7 = _mm_load_si128((__m128i*)andmask1); + + __m128i *p_target, *p_src0; + + p_target = (__m128i*)target; + p_src0 = (__m128i*)src0; + + int bound = num_bytes >> 5; + int intermediate = (num_bytes >> 4) & 1; + int leftovers = (num_bytes >> 1) & 7; + + int i = 0; + + + for(i = 0; i < bound; ++i) { + + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(&p_src0[1]); + + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 2; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + xmm3 = _mm_and_si128(xmm2, xmm7); + + + xmm8 = _mm_add_epi8(xmm8, xmm4); + xmm3 = _mm_add_epi8(xmm3, xmm5); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm8); + xmm1 = _mm_shuffle_epi8(xmm1, xmm3); + + + xmm3 = _mm_add_epi16(xmm0, xmm1); + + + _mm_store_si128(p_target, xmm3); + + p_target += 1; + + } + + for(i = 0; i < intermediate; ++i) { + + xmm0 = _mm_load_si128(p_src0); + + + xmm2 = _mm_xor_si128(xmm2, xmm2); + p_src0 += 1; + + xmm3 = _mm_hsub_epi16(xmm0, xmm1); + xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); + + xmm8 = _mm_and_si128(xmm2, xmm6); + + xmm3 = _mm_add_epi8(xmm8, xmm4); + + xmm0 = _mm_shuffle_epi8(xmm0, xmm3); + + + _mm_storel_pd((double*)p_target, (__m128d)xmm0); + + p_target = (__m128i*)((int8_t*)p_target + 8); + + } + + for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { + target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; + } + + +} + +#endif /*LV_HAVE_SSSE3*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + + for(i = 0; i < bound; i += 2) { + target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; + } + +} + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h new file mode 100644 index 000000000..e52a949fb --- /dev/null +++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h @@ -0,0 +1,139 @@ +#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H +#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H + + +#include +#include + + + + +#if LV_HAVE_SSE2 + +#include +#include + +static inline void volk_16i_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; + + short* p_permute_indexes = permute_indexes; + + p_target = (__m128i*)target; + p_cntl0 = (__m128i*)cntl0; + p_cntl1 = (__m128i*)cntl1; + p_cntl2 = (__m128i*)cntl2; + p_cntl3 = (__m128i*)cntl3; + p_scalars = (__m128i*)scalars; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + xmm0 = _mm_load_si128(p_scalars); + + xmm1 = _mm_shufflelo_epi16(xmm0, 0); + xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); + xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); + xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); + + xmm1 = _mm_shuffle_epi32(xmm1, 0x00); + xmm2 = _mm_shuffle_epi32(xmm2, 0x00); + xmm3 = _mm_shuffle_epi32(xmm3, 0x00); + xmm4 = _mm_shuffle_epi32(xmm4, 0x00); + + + for(; i < bound; ++i) { + xmm0 = _mm_setzero_si128(); + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = _mm_setzero_si128(); + + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); + xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); + xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); + xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); + xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_permute_indexes += 8; + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + xmm5 = _mm_load_si128(p_cntl0); + xmm6 = _mm_load_si128(p_cntl1); + xmm7 = _mm_load_si128(p_cntl2); + + xmm5 = _mm_and_si128(xmm5, xmm1); + xmm6 = _mm_and_si128(xmm6, xmm2); + xmm7 = _mm_and_si128(xmm7, xmm3); + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + xmm5 = _mm_load_si128(p_cntl3); + + xmm6 = _mm_add_epi16(xmm6, xmm7); + + p_cntl0 += 1; + + xmm5 = _mm_and_si128(xmm5, xmm4); + + xmm0 = _mm_add_epi16(xmm0, xmm6); + + p_cntl1 += 1; + p_cntl2 += 1; + + xmm0 = _mm_add_epi16(xmm0, xmm5); + + p_cntl3 += 1; + + _mm_store_si128(p_target, xmm0); + + p_target += 1; + } + + + + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + } +} +#endif /*LV_HAVE_SSEs*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target[i] = src0[permute_indexes[i]] + + (cntl0[i] & scalars[0]) + + (cntl1[i] & scalars[1]) + + (cntl2[i] & scalars[2]) + + (cntl3[i] & scalars[3]); + + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..83fd26ff9 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h @@ -0,0 +1,119 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H +#define INCLUDED_volk_16i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h new file mode 100644 index 000000000..8f0dd0083 --- /dev/null +++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H +#define INCLUDED_volk_16i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h new file mode 100644 index 000000000..e4ec5ab4e --- /dev/null +++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h @@ -0,0 +1,191 @@ +#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H +#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 + +#include + +static inline void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + + + + int i = 0; + + int bound = (num_bytes >> 4); + int bound_copy = bound; + int leftovers = (num_bytes >> 1) & 7; + + __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; + p_target = (__m128i*) target; + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + + + + __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + while(bound_copy > 0) { + + xmm1 = _mm_load_si128(p_src0); + xmm2 = _mm_load_si128(p_src1); + xmm3 = _mm_load_si128(p_src2); + xmm4 = _mm_load_si128(p_src3); + + xmm5 = _mm_setzero_si128(); + xmm6 = _mm_setzero_si128(); + xmm7 = xmm1; + xmm8 = xmm3; + + + xmm1 = _mm_sub_epi16(xmm2, xmm1); + + + + xmm3 = _mm_sub_epi16(xmm4, xmm3); + + xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); + xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); + + + + xmm2 = _mm_and_si128(xmm5, xmm2); + xmm4 = _mm_and_si128(xmm6, xmm4); + xmm5 = _mm_andnot_si128(xmm5, xmm7); + xmm6 = _mm_andnot_si128(xmm6, xmm8); + + xmm5 = _mm_add_epi16(xmm2, xmm5); + xmm6 = _mm_add_epi16(xmm4, xmm6); + + + xmm1 = _mm_xor_si128(xmm1, xmm1); + xmm2 = xmm5; + xmm5 = _mm_sub_epi16(xmm6, xmm5); + p_src0 += 1; + bound_copy -= 1; + + xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); + p_src1 += 1; + + xmm6 = _mm_and_si128(xmm1, xmm6); + + xmm1 = _mm_andnot_si128(xmm1, xmm2); + p_src2 += 1; + + + + xmm1 = _mm_add_epi16(xmm6, xmm1); + p_src3 += 1; + + + _mm_store_si128(p_target, xmm1); + p_target += 1; + + } + + + /*asm volatile + ( + "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t" + + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "movaps %%xmm1, %%xmm7\n\t" + "movaps %%xmm3, %%xmm8\n\t" + "psubw %%xmm2, %%xmm1\n\t" + "psubw %%xmm4, %%xmm3\n\t" + + "pcmpgtw %%xmm1, %%xmm5\n\t" + "pcmpgtw %%xmm3, %%xmm6\n\t" + + "pand %%xmm5, %%xmm2\n\t" + "pand %%xmm6, %%xmm4\n\t" + "pandn %%xmm7, %%xmm5\n\t" + "pandn %%xmm8, %%xmm6\n\t" + + "paddw %%xmm2, %%xmm5\n\t" + "paddw %%xmm4, %%xmm6\n\t" + + "pxor %%xmm1, %%xmm1\n\t" + "movaps %%xmm5, %%xmm2\n\t" + + "psubw %%xmm6, %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $-1, %[bound]\n\t" + + "pcmpgtw %%xmm5, %%xmm1\n\t" + "add $16, %[src1]\n\t" + + "pand %%xmm1, %%xmm6\n\t" + + "pandn %%xmm2, %%xmm1\n\t" + "add $16, %[src2]\n\t" + + "paddw %%xmm6, %%xmm1\n\t" + "add $16, %[src3]\n\t" + + "movaps %%xmm1, (%[target])\n\t" + "addw $16, %[target]\n\t" + "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t" + + "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) + : + ); + */ + + short temp0 = 0; + short temp1 = 0; + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } + return; + + +} + +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC +static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + short temp0 = 0; + short temp1 = 0; + for(i = 0; i < bound; ++i) { + temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; + temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; + target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; + } +} + + + + +#endif /*LV_HAVE_GENERIC*/ + +#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/ diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h new file mode 100644 index 000000000..5744ca3a6 --- /dev/null +++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h @@ -0,0 +1,136 @@ +#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H +#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H + + +#include +#include + + + + + +#if LV_HAVE_SSE2 +#include +#include + +static inline void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; + __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; + p_target0 = (__m128i*)target0; + p_target1 = (__m128i*)target1; + p_target2 = (__m128i*)target2; + p_target3 = (__m128i*)target3; + + p_src0 = (__m128i*)src0; + p_src1 = (__m128i*)src1; + p_src2 = (__m128i*)src2; + p_src3 = (__m128i*)src3; + p_src4 = (__m128i*)src4; + + int i = 0; + + int bound = (num_bytes >> 4); + int leftovers = (num_bytes >> 1) & 7; + + for(; i < bound; ++i) { + xmm0 = _mm_load_si128(p_src0); + xmm1 = _mm_load_si128(p_src1); + xmm2 = _mm_load_si128(p_src2); + xmm3 = _mm_load_si128(p_src3); + xmm4 = _mm_load_si128(p_src4); + + p_src0 += 1; + p_src1 += 1; + + xmm1 = _mm_add_epi16(xmm0, xmm1); + xmm2 = _mm_add_epi16(xmm0, xmm2); + xmm3 = _mm_add_epi16(xmm0, xmm3); + xmm4 = _mm_add_epi16(xmm0, xmm4); + + + p_src2 += 1; + p_src3 += 1; + p_src4 += 1; + + _mm_store_si128(p_target0, xmm1); + _mm_store_si128(p_target1, xmm2); + _mm_store_si128(p_target2, xmm3); + _mm_store_si128(p_target3, xmm4); + + p_target0 += 1; + p_target1 += 1; + p_target2 += 1; + p_target3 += 1; + } + /*asm volatile + ( + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t" + "cmp $0, %[bound]\n\t" + "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t" + "movaps (%[src0]), %%xmm1\n\t" + "movaps (%[src1]), %%xmm2\n\t" + "movaps (%[src2]), %%xmm3\n\t" + "movaps (%[src3]), %%xmm4\n\t" + "movaps (%[src4]), %%xmm5\n\t" + "add $16, %[src0]\n\t" + "add $16, %[src1]\n\t" + "add $16, %[src2]\n\t" + "add $16, %[src3]\n\t" + "add $16, %[src4]\n\t" + "paddw %%xmm1, %%xmm2\n\t" + "paddw %%xmm1, %%xmm3\n\t" + "paddw %%xmm1, %%xmm4\n\t" + "paddw %%xmm1, %%xmm5\n\t" + "add $-1, %[bound]\n\t" + "movaps %%xmm2, (%[target0])\n\t" + "movaps %%xmm3, (%[target1])\n\t" + "movaps %%xmm4, (%[target2])\n\t" + "movaps %%xmm5, (%[target3])\n\t" + "add $16, %[target0]\n\t" + "add $16, %[target1]\n\t" + "add $16, %[target2]\n\t" + "add $16, %[target3]\n\t" + "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t" + ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t" + : + :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) + :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + + */ + + + for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} +#endif /*LV_HAVE_SSE2*/ + + +#if LV_HAVE_GENERIC + +static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { + + int i = 0; + + int bound = num_bytes >> 1; + + for(i = 0; i < bound; ++i) { + target0[i] = src0[i] + src1[i]; + target1[i] = src0[i] + src2[i]; + target2[i] = src0[i] + src3[i]; + target3[i] = src0[i] + src4[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/ diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h new file mode 100644 index 000000000..7e08bf182 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); + __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); + qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = *int16ComplexVectorPtr++; + *qBufferPtr++ = *int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); + + iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); + + qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); + + qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); + + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..388c00592 --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + __m128i complexVal1, complexVal2, iOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + iOutputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i complexVal1, complexVal2, iOutputVal; + __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); + __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; + + complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); + + complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); + + iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h new file mode 100644 index 000000000..437d5ab6b --- /dev/null +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H +#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); + + complexVal1 = _mm_or_si128(complexVal1, complexVal2); + + complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); + complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); + + complexVal3 = _mm_or_si128(complexVal3, complexVal4); + + + complexVal1 = _mm_srai_epi16(complexVal1, 8); + complexVal3 = _mm_srai_epi16(complexVal3, 8); + + iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + int16ComplexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (int16_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); +static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */ diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h new file mode 100644 index 000000000..bdcace750 --- /dev/null +++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h @@ -0,0 +1,190 @@ +#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H +#define INCLUDED_volk_16ic_magnitude_16i_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(32768.0); + __m128 invScalar = _mm_set_ps1(1.0/32768.0); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float inputFloatBuffer[4] __attribute__((aligned(128))); + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue1 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + cplxValue2 = _mm_load_ps(inputFloatBuffer); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + result = _mm_mul_ps(result, vScalar); // Scale the results + + _mm_store_ps(outputFloatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + const float val1Real = (float)(*complexVectorPtr++) / 32768.0; + const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; + const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; + *magnitudeVectorPtr++ = (int16_t)(val1Result); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float scalar = 32768.0; + for(number = 0; number < num_points; number++){ + float real = ((float)(*complexVectorPtr++)) / scalar; + float imag = ((float)(*complexVectorPtr++)) / scalar; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); +static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ + volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..606de2fc5 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h @@ -0,0 +1,108 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + uint64_t number = 0; + const uint64_t quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex data values to be deinterleaved + */ +extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..62331e496 --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,125 @@ +#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); + + for(;number < quarterPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi16_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; + sixteenTComplexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar = 1.0/scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 16 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h new file mode 100644 index 000000000..ae64efbeb --- /dev/null +++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h @@ -0,0 +1,179 @@ +#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H +#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + __m128 cplxValue1, cplxValue2, result; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) / scalar; + float val1Imag = (float)(*complexVectorPtr++) / scalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + + __m128 cplxValue1, cplxValue2, result, re, im; + + float inputFloatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + inputFloatBuffer[0] = (float)(complexVectorPtr[0]); + inputFloatBuffer[1] = (float)(complexVectorPtr[1]); + inputFloatBuffer[2] = (float)(complexVectorPtr[2]); + inputFloatBuffer[3] = (float)(complexVectorPtr[3]); + + inputFloatBuffer[4] = (float)(complexVectorPtr[4]); + inputFloatBuffer[5] = (float)(complexVectorPtr[5]); + inputFloatBuffer[6] = (float)(complexVectorPtr[6]); + inputFloatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); + cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); + + re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); + im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(re, invScalar); + cplxValue2 = _mm_mul_ps(im, invScalar); + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); // Square root the values + + _mm_store_ps(magnitudeVectorPtr, result); + + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + complexVectorPtr = (const int16_t*)&complexVector[number]; + for(; number < num_points; number++){ + float val1Real = (float)(*complexVectorPtr++) * iScalar; + float val1Imag = (float)(*complexVectorPtr++) * iScalar; + *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); + } +} + + +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + const int16_t* complexVectorPtr = (const int16_t*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + float real = ( (float) (*complexVectorPtr++)) * invScalar; + float imag = ( (float) (*complexVectorPtr++)) * invScalar; + *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC_DISABLED +/*! + \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param scalar The data value to be divided against each input data value of the input complex vector + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ + volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h deleted file mode 100644 index 67d0c55a3..000000000 --- a/volk/include/volk/volk_16s_add_quad_a16.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef INCLUDED_volk_16s_add_quad_a16_H -#define INCLUDED_volk_16s_add_quad_a16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 -#include -#include - -static inline void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2, *p_src3, *p_src4; - p_target0 = (__m128i*)target0; - p_target1 = (__m128i*)target1; - p_target2 = (__m128i*)target2; - p_target3 = (__m128i*)target3; - - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - p_src4 = (__m128i*)src4; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - for(; i < bound; ++i) { - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(p_src1); - xmm2 = _mm_load_si128(p_src2); - xmm3 = _mm_load_si128(p_src3); - xmm4 = _mm_load_si128(p_src4); - - p_src0 += 1; - p_src1 += 1; - - xmm1 = _mm_add_epi16(xmm0, xmm1); - xmm2 = _mm_add_epi16(xmm0, xmm2); - xmm3 = _mm_add_epi16(xmm0, xmm3); - xmm4 = _mm_add_epi16(xmm0, xmm4); - - - p_src2 += 1; - p_src3 += 1; - p_src4 += 1; - - _mm_store_si128(p_target0, xmm1); - _mm_store_si128(p_target1, xmm2); - _mm_store_si128(p_target2, xmm3); - _mm_store_si128(p_target3, xmm4); - - p_target0 += 1; - p_target1 += 1; - p_target2 += 1; - p_target3 += 1; - } - /*asm volatile - ( - ".%=volk_16s_add_quad_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je .%=volk_16s_add_quad_a16_sse2_END\n\t" - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - "movaps (%[src4]), %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $16, %[src1]\n\t" - "add $16, %[src2]\n\t" - "add $16, %[src3]\n\t" - "add $16, %[src4]\n\t" - "paddw %%xmm1, %%xmm2\n\t" - "paddw %%xmm1, %%xmm3\n\t" - "paddw %%xmm1, %%xmm4\n\t" - "paddw %%xmm1, %%xmm5\n\t" - "add $-1, %[bound]\n\t" - "movaps %%xmm2, (%[target0])\n\t" - "movaps %%xmm3, (%[target1])\n\t" - "movaps %%xmm4, (%[target2])\n\t" - "movaps %%xmm5, (%[target3])\n\t" - "add $16, %[target0]\n\t" - "add $16, %[target1]\n\t" - "add $16, %[target2]\n\t" - "add $16, %[target3]\n\t" - "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t" - ".%=volk_16s_add_quad_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3) - :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); - - */ - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC - -static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target0[i] = src0[i] + src1[i]; - target1[i] = src0[i] + src2[i]; - target2[i] = src0[i] + src3[i]; - target3[i] = src0[i] + src4[i]; - } -} - -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /*INCLUDED_volk_16s_add_quad_a16_H*/ diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h deleted file mode 100644 index 4c1af8729..000000000 --- a/volk/include/volk/volk_16s_branch_4_state_8_a16.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H -#define INCLUDED_volk_16s_branch_4_state_8_a16_H - - -#include -#include - - - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_branch_4_state_8_a16_ssse3(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11; - - __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars; - - - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = 1; - - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - xmm0 = _mm_load_si128((__m128i*)permuters[0]); - xmm6 = _mm_load_si128((__m128i*)permuters[1]); - xmm8 = _mm_load_si128((__m128i*)permuters[2]); - xmm10 = _mm_load_si128((__m128i*)permuters[3]); - - for(; i < bound; ++i) { - - xmm5 = _mm_load_si128(p_src0); - - - - - - - - - - xmm0 = _mm_shuffle_epi8(xmm5, xmm0); - xmm6 = _mm_shuffle_epi8(xmm5, xmm6); - xmm8 = _mm_shuffle_epi8(xmm5, xmm8); - xmm10 = _mm_shuffle_epi8(xmm5, xmm10); - - p_src0 += 4; - - - xmm5 = _mm_add_epi16(xmm1, xmm2); - - xmm6 = _mm_add_epi16(xmm2, xmm6); - xmm8 = _mm_add_epi16(xmm1, xmm8); - - - xmm7 = _mm_load_si128(p_cntl2); - xmm9 = _mm_load_si128(p_cntl3); - - xmm0 = _mm_add_epi16(xmm5, xmm0); - - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm5 = _mm_load_si128(&p_cntl2[1]); - xmm11 = _mm_load_si128(&p_cntl3[1]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm7); - - - - xmm7 = _mm_load_si128(&p_cntl2[2]); - xmm9 = _mm_load_si128(&p_cntl3[2]); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - xmm7 = _mm_and_si128(xmm7, xmm3); - xmm9 = _mm_and_si128(xmm9, xmm4); - - xmm6 = _mm_add_epi16(xmm6, xmm5); - - - xmm5 = _mm_load_si128(&p_cntl2[3]); - xmm11 = _mm_load_si128(&p_cntl3[3]); - - xmm7 = _mm_add_epi16(xmm7, xmm9); - - xmm5 = _mm_and_si128(xmm5, xmm3); - xmm11 = _mm_and_si128(xmm11, xmm4); - - xmm8 = _mm_add_epi16(xmm8, xmm7); - - xmm5 = _mm_add_epi16(xmm5, xmm11); - - _mm_store_si128(p_target, xmm0); - _mm_store_si128(&p_target[1], xmm6); - - xmm10 = _mm_add_epi16(xmm5, xmm10); - - _mm_store_si128(&p_target[2], xmm8); - - _mm_store_si128(&p_target[3], xmm10); - - p_target += 3; - } -} - - -#endif /*LV_HAVE_SSEs*/ - -#if LV_HAVE_GENERIC -static inline void volk_16s_branch_4_state_8_a16_generic(short* target, short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) { - int i = 0; - - int bound = 4; - - for(; i < bound; ++i) { - target[i* 8] = src0[((char)permuters[i][0])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8] & scalars[2]) - + (cntl3[i * 8] & scalars[3]); - target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 1] & scalars[2]) - + (cntl3[i * 8 + 1] & scalars[3]); - target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 2] & scalars[2]) - + (cntl3[i * 8 + 2] & scalars[3]); - target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 3] & scalars[2]) - + (cntl3[i * 8 + 3] & scalars[3]); - target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 4] & scalars[2]) - + (cntl3[i * 8 + 4] & scalars[3]); - target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 5] & scalars[2]) - + (cntl3[i * 8 + 5] & scalars[3]); - target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 6] & scalars[2]) - + (cntl3[i * 8 + 6] & scalars[3]); - target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] - + ((i + 1)%2 * scalars[0]) - + (((i >> 1)^1) * scalars[1]) - + (cntl2[i * 8 + 7] & scalars[2]) - + (cntl3[i * 8 + 7] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/ diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h deleted file mode 100644 index 13db435de..000000000 --- a/volk/include/volk/volk_16s_convert_8s_a16.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef INCLUDED_volk_16s_convert_8s_a16_H -#define INCLUDED_volk_16s_convert_8s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted -*/ -static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h deleted file mode 100644 index 9941118ae..000000000 --- a/volk/include/volk/volk_16s_convert_8s_ua16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_16s_convert_8s_ua16_H -#define INCLUDED_volk_16s_convert_8s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - int8_t* outputVectorPtr = outputVector; - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal1; - __m128i inputVal2; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - - // Load the 16 values - inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8; - - inputVal1 = _mm_srai_epi16(inputVal1, 8); - inputVal2 = _mm_srai_epi16(inputVal2, 8); - - ret = _mm_packs_epi16(inputVal1, inputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, ret); - - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] =(int8_t)(inputVector[number] >> 8); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the input 16 bit integer data into 8 bit integer data - \param inputVector The 16 bit input data buffer - \param outputVector The 8 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned -*/ -static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h deleted file mode 100644 index b2ec90552..000000000 --- a/volk/include/volk/volk_16s_max_star_16s_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16s_max_star_16s_a16_H -#define INCLUDED_volk_16s_max_star_16s_a16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) { - - - - short candidate = src0[0]; - short cands[8]; - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; - - - __m128i *p_src0; - - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - xmm1 = _mm_setzero_si128(); - xmm0 = _mm_setzero_si128(); - //_mm_insert_epi16(xmm0, candidate, 0); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm1); - - - for(i = 0; i < bound; ++i) { - xmm1 = _mm_load_si128(p_src0); - p_src0 += 1; - xmm2 = _mm_sub_epi16(xmm1, xmm0); - - - - - - - xmm3 = _mm_cmpgt_epi16(xmm0, xmm1); - xmm4 = _mm_cmpeq_epi16(xmm0, xmm1); - xmm5 = _mm_cmpgt_epi16(xmm1, xmm0); - - xmm6 = _mm_xor_si128(xmm4, xmm5); - - xmm3 = _mm_and_si128(xmm3, xmm0); - xmm4 = _mm_and_si128(xmm6, xmm1); - - xmm0 = _mm_add_epi16(xmm3, xmm4); - - - } - - _mm_store_si128((__m128i*)cands, xmm0); - - for(i = 0; i < 8; ++i) { - candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i]; - } - - - - for(i = 0; i < leftovers; ++i) { - - candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i]; - } - - target[0] = candidate; - - - - - -} - -#endif /*LV_HAVE_SSSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short candidate = src0[0]; - for(i = 1; i < bound; ++i) { - candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i]; - } - target[0] = candidate; - -} - - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h deleted file mode 100644 index 68994593b..000000000 --- a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H -#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H - - -#include -#include - - -#if LV_HAVE_SSSE3 - -#include -#include -#include - -static inline void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d}; - const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02}; - - - - volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; - __m128i xmm5, xmm6, xmm7, xmm8; - - xmm4 = _mm_load_si128((__m128i*)shufmask0); - xmm5 = _mm_load_si128((__m128i*)shufmask1); - xmm6 = _mm_load_si128((__m128i*)andmask0); - xmm7 = _mm_load_si128((__m128i*)andmask1); - - __m128i *p_target, *p_src0; - - p_target = (__m128i*)target; - p_src0 = (__m128i*)src0; - - int bound = num_bytes >> 5; - int intermediate = (num_bytes >> 4) & 1; - int leftovers = (num_bytes >> 1) & 7; - - int i = 0; - - - for(i = 0; i < bound; ++i) { - - xmm0 = _mm_load_si128(p_src0); - xmm1 = _mm_load_si128(&p_src0[1]); - - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 2; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - xmm3 = _mm_and_si128(xmm2, xmm7); - - - xmm8 = _mm_add_epi8(xmm8, xmm4); - xmm3 = _mm_add_epi8(xmm3, xmm5); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm8); - xmm1 = _mm_shuffle_epi8(xmm1, xmm3); - - - xmm3 = _mm_add_epi16(xmm0, xmm1); - - - _mm_store_si128(p_target, xmm3); - - p_target += 1; - - } - - for(i = 0; i < intermediate; ++i) { - - xmm0 = _mm_load_si128(p_src0); - - - xmm2 = _mm_xor_si128(xmm2, xmm2); - p_src0 += 1; - - xmm3 = _mm_hsub_epi16(xmm0, xmm1); - xmm2 = _mm_cmpgt_epi16(xmm2, xmm3); - - xmm8 = _mm_and_si128(xmm2, xmm6); - - xmm3 = _mm_add_epi8(xmm8, xmm4); - - xmm0 = _mm_shuffle_epi8(xmm0, xmm3); - - - _mm_storel_pd((double*)p_target, (__m128d)xmm0); - - p_target = (__m128i*)((int8_t*)p_target + 8); - - } - - for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { - target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1]; - } - - -} - -#endif /*LV_HAVE_SSSE3*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - - for(i = 0; i < bound; i += 2) { - target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1]; - } - -} - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h deleted file mode 100644 index 2e7586b57..000000000 --- a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H -#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H - - -#include -#include - - - - -#if LV_HAVE_SSE2 - -#include -#include - -static inline void volk_16s_permute_and_scalar_add_a16_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars; - - short* p_permute_indexes = permute_indexes; - - p_target = (__m128i*)target; - p_cntl0 = (__m128i*)cntl0; - p_cntl1 = (__m128i*)cntl1; - p_cntl2 = (__m128i*)cntl2; - p_cntl3 = (__m128i*)cntl3; - p_scalars = (__m128i*)scalars; - - int i = 0; - - int bound = (num_bytes >> 4); - int leftovers = (num_bytes >> 1) & 7; - - xmm0 = _mm_load_si128(p_scalars); - - xmm1 = _mm_shufflelo_epi16(xmm0, 0); - xmm2 = _mm_shufflelo_epi16(xmm0, 0x55); - xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa); - xmm4 = _mm_shufflelo_epi16(xmm0, 0xff); - - xmm1 = _mm_shuffle_epi32(xmm1, 0x00); - xmm2 = _mm_shuffle_epi32(xmm2, 0x00); - xmm3 = _mm_shuffle_epi32(xmm3, 0x00); - xmm4 = _mm_shuffle_epi32(xmm4, 0x00); - - - for(; i < bound; ++i) { - xmm0 = _mm_setzero_si128(); - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = _mm_setzero_si128(); - - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3); - xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4); - xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5); - xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6); - xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_permute_indexes += 8; - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - xmm5 = _mm_load_si128(p_cntl0); - xmm6 = _mm_load_si128(p_cntl1); - xmm7 = _mm_load_si128(p_cntl2); - - xmm5 = _mm_and_si128(xmm5, xmm1); - xmm6 = _mm_and_si128(xmm6, xmm2); - xmm7 = _mm_and_si128(xmm7, xmm3); - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - xmm5 = _mm_load_si128(p_cntl3); - - xmm6 = _mm_add_epi16(xmm6, xmm7); - - p_cntl0 += 1; - - xmm5 = _mm_and_si128(xmm5, xmm4); - - xmm0 = _mm_add_epi16(xmm0, xmm6); - - p_cntl1 += 1; - p_cntl2 += 1; - - xmm0 = _mm_add_epi16(xmm0, xmm5); - - p_cntl3 += 1; - - _mm_store_si128(p_target, xmm0); - - p_target += 1; - } - - - - - - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - } -} -#endif /*LV_HAVE_SSEs*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - for(i = 0; i < bound; ++i) { - target[i] = src0[permute_indexes[i]] - + (cntl0[i] & scalars[0]) - + (cntl1[i] & scalars[1]) - + (cntl2[i] & scalars[2]) - + (cntl3[i] & scalars[3]); - - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/ diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h deleted file mode 100644 index 3e89ff963..000000000 --- a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H -#define INCLUDED_volk_16s_quad_max_star_16s_a16_H - - -#include -#include - - - - - -#if LV_HAVE_SSE2 - -#include - -static inline void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - - - - int i = 0; - - int bound = (num_bytes >> 4); - int bound_copy = bound; - int leftovers = (num_bytes >> 1) & 7; - - __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3; - p_target = (__m128i*) target; - p_src0 = (__m128i*)src0; - p_src1 = (__m128i*)src1; - p_src2 = (__m128i*)src2; - p_src3 = (__m128i*)src3; - - - - __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - while(bound_copy > 0) { - - xmm1 = _mm_load_si128(p_src0); - xmm2 = _mm_load_si128(p_src1); - xmm3 = _mm_load_si128(p_src2); - xmm4 = _mm_load_si128(p_src3); - - xmm5 = _mm_setzero_si128(); - xmm6 = _mm_setzero_si128(); - xmm7 = xmm1; - xmm8 = xmm3; - - - xmm1 = _mm_sub_epi16(xmm2, xmm1); - - - - xmm3 = _mm_sub_epi16(xmm4, xmm3); - - xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); - xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); - - - - xmm2 = _mm_and_si128(xmm5, xmm2); - xmm4 = _mm_and_si128(xmm6, xmm4); - xmm5 = _mm_andnot_si128(xmm5, xmm7); - xmm6 = _mm_andnot_si128(xmm6, xmm8); - - xmm5 = _mm_add_epi16(xmm2, xmm5); - xmm6 = _mm_add_epi16(xmm4, xmm6); - - - xmm1 = _mm_xor_si128(xmm1, xmm1); - xmm2 = xmm5; - xmm5 = _mm_sub_epi16(xmm6, xmm5); - p_src0 += 1; - bound_copy -= 1; - - xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); - p_src1 += 1; - - xmm6 = _mm_and_si128(xmm1, xmm6); - - xmm1 = _mm_andnot_si128(xmm1, xmm2); - p_src2 += 1; - - - - xmm1 = _mm_add_epi16(xmm6, xmm1); - p_src3 += 1; - - - _mm_store_si128(p_target, xmm1); - p_target += 1; - - } - - - /*asm volatile - ( - "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t" - "cmp $0, %[bound]\n\t" - "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t" - - "movaps (%[src0]), %%xmm1\n\t" - "movaps (%[src1]), %%xmm2\n\t" - "movaps (%[src2]), %%xmm3\n\t" - "movaps (%[src3]), %%xmm4\n\t" - - "pxor %%xmm5, %%xmm5\n\t" - "pxor %%xmm6, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "movaps %%xmm3, %%xmm8\n\t" - "psubw %%xmm2, %%xmm1\n\t" - "psubw %%xmm4, %%xmm3\n\t" - - "pcmpgtw %%xmm1, %%xmm5\n\t" - "pcmpgtw %%xmm3, %%xmm6\n\t" - - "pand %%xmm5, %%xmm2\n\t" - "pand %%xmm6, %%xmm4\n\t" - "pandn %%xmm7, %%xmm5\n\t" - "pandn %%xmm8, %%xmm6\n\t" - - "paddw %%xmm2, %%xmm5\n\t" - "paddw %%xmm4, %%xmm6\n\t" - - "pxor %%xmm1, %%xmm1\n\t" - "movaps %%xmm5, %%xmm2\n\t" - - "psubw %%xmm6, %%xmm5\n\t" - "add $16, %[src0]\n\t" - "add $-1, %[bound]\n\t" - - "pcmpgtw %%xmm5, %%xmm1\n\t" - "add $16, %[src1]\n\t" - - "pand %%xmm1, %%xmm6\n\t" - - "pandn %%xmm2, %%xmm1\n\t" - "add $16, %[src2]\n\t" - - "paddw %%xmm6, %%xmm1\n\t" - "add $16, %[src3]\n\t" - - "movaps %%xmm1, (%[target])\n\t" - "addw $16, %[target]\n\t" - "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t" - - "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t" - : - :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target) - : - ); - */ - - short temp0 = 0; - short temp1 = 0; - for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } - return; - - -} - -#endif /*LV_HAVE_SSE2*/ - - -#if LV_HAVE_GENERIC -static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) { - - int i = 0; - - int bound = num_bytes >> 1; - - short temp0 = 0; - short temp1 = 0; - for(i = 0; i < bound; ++i) { - temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; - temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; - target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; - } -} - - - - -#endif /*LV_HAVE_GENERIC*/ - -#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h deleted file mode 100644 index 8f9b44478..000000000 --- a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H -#define INCLUDED_volk_16s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h deleted file mode 100644 index ad52aea1a..000000000 --- a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h deleted file mode 100644 index 8e5da24ec..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H -#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); - __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); - qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = *int16ComplexVectorPtr++; - *qBufferPtr++ = *int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0)); - - iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); - - qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); - - qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); - - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h deleted file mode 100644 index 068c1350c..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - - __m128i complexVal1, complexVal2, iOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - iOutputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i complexVal1, complexVal2, iOutputVal; - __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); - __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 8; - - complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0)); - - complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); - - iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h deleted file mode 100644 index afa21ebc4..000000000 --- a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H -#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2); - - complexVal1 = _mm_or_si128(complexVal1, complexVal2); - - complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1); - complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2); - - complexVal3 = _mm_or_si128(complexVal3, complexVal4); - - - complexVal1 = _mm_srai_epi16(complexVal1, 8); - complexVal3 = _mm_srai_epi16(complexVal3, 8); - - iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); - int16ComplexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); -static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */ diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h deleted file mode 100644 index d832de5fe..000000000 --- a/volk/include/volk/volk_16sc_magnitude_16s_a16.h +++ /dev/null @@ -1,190 +0,0 @@ -#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H -#define INCLUDED_volk_16sc_magnitude_16s_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(32768.0); - __m128 invScalar = _mm_set_ps1(1.0/32768.0); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float inputFloatBuffer[4] __attribute__((aligned(128))); - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue1 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - cplxValue2 = _mm_load_ps(inputFloatBuffer); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - result = _mm_mul_ps(result, vScalar); // Scale the results - - _mm_store_ps(outputFloatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - const float val1Real = (float)(*complexVectorPtr++) / 32768.0; - const float val1Imag = (float)(*complexVectorPtr++) / 32768.0; - const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0; - *magnitudeVectorPtr++ = (int16_t)(val1Result); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float scalar = 32768.0; - for(number = 0; number < num_points; number++){ - float real = ((float)(*complexVectorPtr++)) / scalar; - float imag = ((float)(*complexVectorPtr++)) / scalar; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points); -static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){ - volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h deleted file mode 100644 index 53e4253c4..000000000 --- a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - uint64_t number = 0; - const uint64_t quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex data values to be deinterleaved - */ -extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 7320db368..000000000 --- a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); - - for(;number < quarterPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi16_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar; - sixteenTComplexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar = 1.0/scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 16 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h deleted file mode 100644 index 649b5cc96..000000000 --- a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H -#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - __m128 cplxValue1, cplxValue2, result; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) / scalar; - float val1Imag = (float)(*complexVectorPtr++) / scalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - - __m128 cplxValue1, cplxValue2, result, re, im; - - float inputFloatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - inputFloatBuffer[0] = (float)(complexVectorPtr[0]); - inputFloatBuffer[1] = (float)(complexVectorPtr[1]); - inputFloatBuffer[2] = (float)(complexVectorPtr[2]); - inputFloatBuffer[3] = (float)(complexVectorPtr[3]); - - inputFloatBuffer[4] = (float)(complexVectorPtr[4]); - inputFloatBuffer[5] = (float)(complexVectorPtr[5]); - inputFloatBuffer[6] = (float)(complexVectorPtr[6]); - inputFloatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); - cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); - - re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); - im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(re, invScalar); - cplxValue2 = _mm_mul_ps(im, invScalar); - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); // Square root the values - - _mm_store_ps(magnitudeVectorPtr, result); - - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - complexVectorPtr = (const int16_t*)&complexVector[number]; - for(; number < num_points; number++){ - float val1Real = (float)(*complexVectorPtr++) * iScalar; - float val1Imag = (float)(*complexVectorPtr++) * iScalar; - *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)); - } -} - - -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - const int16_t* complexVectorPtr = (const int16_t*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - float real = ( (float) (*complexVectorPtr++)) * invScalar; - float imag = ( (float) (*complexVectorPtr++)) * invScalar; - *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag)); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC_DISABLED -/*! - \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param scalar The data value to be divided against each input data value of the input complex vector - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){ - volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h deleted file mode 100644 index a0f97f94e..000000000 --- a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h +++ /dev/null @@ -1,151 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H -#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H - -#include -#include -#include - -#ifndef MAX -#define MAX(X,Y) ((X) > (Y)?(X):(Y)) -#endif - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; - - xmm9 = _mm_setzero_ps(); - xmm1 = _mm_setzero_ps(); - - xmm0 = _mm_load1_ps(¢er_point_array[0]); - xmm6 = _mm_load1_ps(¢er_point_array[1]); - xmm7 = _mm_load1_ps(¢er_point_array[2]); - xmm8 = _mm_load1_ps(¢er_point_array[3]); - //xmm11 = _mm_load1_ps(¢er_point_array[4]); - xmm10 = _mm_load1_ps(cutoff); - - int bound = num_bytes >> 4; - int leftovers = (num_bytes >> 2) & 3; - int i = 0; - - for(; i < bound; ++i) { - xmm2 = _mm_load_ps(src0); - xmm2 = _mm_max_ps(xmm10, xmm2); - xmm3 = _mm_mul_ps(xmm2, xmm2); - xmm4 = _mm_mul_ps(xmm2, xmm3); - xmm5 = _mm_mul_ps(xmm3, xmm3); - //xmm12 = _mm_mul_ps(xmm3, xmm4); - - xmm2 = _mm_mul_ps(xmm2, xmm0); - xmm3 = _mm_mul_ps(xmm3, xmm6); - xmm4 = _mm_mul_ps(xmm4, xmm7); - xmm5 = _mm_mul_ps(xmm5, xmm8); - //xmm12 = _mm_mul_ps(xmm12, xmm11); - - xmm2 = _mm_add_ps(xmm2, xmm3); - xmm3 = _mm_add_ps(xmm4, xmm5); - - src0 += 4; - - xmm9 = _mm_add_ps(xmm2, xmm9); - - xmm1 = _mm_add_ps(xmm3, xmm1); - - //xmm9 = _mm_add_ps(xmm12, xmm9); - } - - xmm2 = _mm_hadd_ps(xmm9, xmm1); - xmm3 = _mm_hadd_ps(xmm2, xmm2); - xmm4 = _mm_hadd_ps(xmm3, xmm3); - - _mm_store_ss(&result, xmm4); - - - - for(i = 0; i < leftovers; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth);// + - //center_point_array[4] * fith); - } - - result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; - - target[0] = result; -} - - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC - -static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { - - - - float result = 0.0; - float fst = 0.0; - float sq = 0.0; - float thrd = 0.0; - float frth = 0.0; - //float fith = 0.0; - - - - int i = 0; - - for(; i < num_bytes >> 2; ++i) { - fst = src0[i]; - fst = MAX(fst, *cutoff); - - sq = fst * fst; - thrd = fst * sq; - frth = sq * sq; - //fith = sq * thrd; - - result += (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth); //+ - //center_point_array[4] * fith); - /*printf("%f12...%d\n", (center_point_array[0] * fst + - center_point_array[1] * sq + - center_point_array[2] * thrd + - center_point_array[3] * frth) + - //center_point_array[4] * fith) + - (center_point_array[4]), i); - */ - } - - result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); - - - - *target = result; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h deleted file mode 100644 index ba38c310f..000000000 --- a/volk/include/volk/volk_32f_32f_add_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H -#define INCLUDED_volk_32f_32f_add_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h deleted file mode 100644 index a0995e631..000000000 --- a/volk/include/volk/volk_32f_32f_divide_32f_a16.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H -#define INCLUDED_volk_32f_32f_divide_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_div_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) / (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Divides the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be divideed - \param bVector The divisor vector - \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector -*/ -extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h deleted file mode 100644 index 63f5221d3..000000000 --- a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H -#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - - bVal1 = _mm_load_ps(bPtr); bPtr += 4; - bVal2 = _mm_load_ps(bPtr); bPtr += 4; - bVal3 = _mm_load_ps(bPtr); bPtr += 4; - bVal4 = _mm_load_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h deleted file mode 100644 index b5fa7d7a4..000000000 --- a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h +++ /dev/null @@ -1,184 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H -#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr= taps; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE - - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_add_ps(cVal, dotProdVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal, bVal, cVal; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - dotProdVal = _mm_hadd_ps(dotProdVal, cVal); - - aPtr += 4; - bPtr += 4; - } - - float dotProductVector[4] __attribute__((aligned(16))); - dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); - - _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float dotProduct = 0; - const float* aPtr = input; - const float* bPtr = taps; - - __m128 aVal1, bVal1, cVal1; - __m128 aVal2, bVal2, cVal2; - __m128 aVal3, bVal3, cVal3; - __m128 aVal4, bVal4, cVal4; - - __m128 dotProdVal = _mm_setzero_ps(); - - for(;number < sixteenthPoints; number++){ - - aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; - aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; - - bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; - bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; - - cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); - cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); - cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); - cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - dotProdVal = _mm_add_ps(dotProdVal, cVal1); - } - - float dotProductVector[4] __attribute__((aligned(16))); - _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector - - dotProduct = dotProductVector[0]; - dotProduct += dotProductVector[1]; - dotProduct += dotProductVector[2]; - dotProduct += dotProductVector[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - dotProduct += ((*aPtr++) * (*bPtr++)); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/ diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h deleted file mode 100644 index 34ea93349..000000000 --- a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H -#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Interleaves the I & Q vector data into the complex vector - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - unsigned int number = 0; - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - const uint64_t quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - _mm_store_ps(complexVectorPtr, cplxValue); - complexVectorPtr += 4; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Interleaves the I & Q vector data into the complex vector. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param num_points The number of complex data values to be interleaved -*/ -static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ - float* complexVectorPtr = (float*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = *iBufferPtr++; - *complexVectorPtr++ = *qBufferPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h deleted file mode 100644 index 8ca7a5ba8..000000000 --- a/volk/include/volk/volk_32f_32f_max_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H -#define INCLUDED_volk_32f_32f_max_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_max_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h deleted file mode 100644 index dd05988be..000000000 --- a/volk/include/volk/volk_32f_32f_min_32f_a16.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H -#define INCLUDED_volk_32f_32f_min_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_min_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const float a = *aPtr++; - const float b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h deleted file mode 100644 index 2d004db10..000000000 --- a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H -#define INCLUDED_volk_32f_32f_multiply_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_mul_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Multiplys the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector -*/ -extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h deleted file mode 100644 index 207382a19..000000000 --- a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H -#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue1, cplxValue2; - __m128i intValue1, intValue2; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); - cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); - cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); - - intValue1 = _mm_cvtps_epi32(cplxValue1); - intValue2 = _mm_cvtps_epi32(cplxValue2); - - intValue1 = _mm_packs_epi32(intValue1, intValue2); - - _mm_store_si128((__m128i*)complexVectorPtr, intValue1); - complexVectorPtr += 8; - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - const unsigned int quarterPoints = num_points / 4; - - __m128 iValue, qValue, cplxValue; - - int16_t* complexVectorPtr = (int16_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - iValue = _mm_load_ps(iBufferPtr); - qValue = _mm_load_ps(qBufferPtr); - - // Interleaves the lower two values in the i and q variables into one buffer - cplxValue = _mm_unpacklo_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - // Interleaves the upper two values in the i and q variables into one buffer - cplxValue = _mm_unpackhi_ps(iValue, qValue); - cplxValue = _mm_mul_ps(cplxValue, vScalar); - - _mm_store_ps(floatBuffer, cplxValue); - - *complexVectorPtr++ = (int16_t)(floatBuffer[0]); - *complexVectorPtr++ = (int16_t)(floatBuffer[1]); - *complexVectorPtr++ = (int16_t)(floatBuffer[2]); - *complexVectorPtr++ = (int16_t)(floatBuffer[3]); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int16_t*)(&complexVector[number]); - for(; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. - \param iBuffer The I buffer data to be interleaved - \param qBuffer The Q buffer data to be interleaved - \param complexVector The complex output vector - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be interleaved - */ -static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ - int16_t* complexVectorPtr = (int16_t*)complexVector; - const float* iBufferPtr = iBuffer; - const float* qBufferPtr = qBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); - *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */ diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h deleted file mode 100644 index 9fea6aa27..000000000 --- a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H -#define INCLUDED_volk_32f_32f_subtract_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_sub_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) - (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Subtracts bVector form aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The initial vector - \param bVector The vector to be subtracted - \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector -*/ -extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h new file mode 100644 index 000000000..a825767de --- /dev/null +++ b/volk/include/volk/volk_32f_convert_64f_u.h @@ -0,0 +1,70 @@ +#ifndef INCLUDED_volk_32f_convert_64f_u_H +#define INCLUDED_volk_32f_convert_64f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + double* outputVectorPtr = outputVector; + __m128d ret; + __m128 inputVal; + + for(;number < quarterPoints; number++){ + inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + + inputVal = _mm_movehl_ps(inputVal, inputVal); + + ret = _mm_cvtps_pd(inputVal); + + _mm_storeu_pd(outputVectorPtr, ret); + outputVectorPtr += 2; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (double)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the float values into double values + \param dVector The converted double vector values + \param fVector The float vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){ + double* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((double)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_convert_64f_u_H */ diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h deleted file mode 100644 index c8de768dc..000000000 --- a/volk/include/volk/volk_32f_convert_64f_ua16.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INCLUDED_volk_32f_convert_64f_ua16_H -#define INCLUDED_volk_32f_convert_64f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - double* outputVectorPtr = outputVector; - __m128d ret; - __m128 inputVal; - - for(;number < quarterPoints; number++){ - inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - - inputVal = _mm_movehl_ps(inputVal, inputVal); - - ret = _mm_cvtps_pd(inputVal); - - _mm_storeu_pd(outputVectorPtr, ret); - outputVectorPtr += 2; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (double)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the float values into double values - \param dVector The converted double vector values - \param fVector The float vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){ - double* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((double)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h new file mode 100644 index 000000000..d6b16e336 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h @@ -0,0 +1,110 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H +#define INCLUDED_volk_32f_s32f_convert_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h new file mode 100644 index 000000000..4d306e53c --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h @@ -0,0 +1,113 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H +#define INCLUDED_volk_32f_s32f_convert_16i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h deleted file mode 100644 index cf51cf9c5..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H -#define INCLUDED_volk_32f_s32f_convert_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h deleted file mode 100644 index 53d159f82..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h new file mode 100644 index 000000000..ae874fd7b --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h @@ -0,0 +1,106 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H +#define INCLUDED_volk_32f_s32f_convert_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h new file mode 100644 index 000000000..561fcd800 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H +#define INCLUDED_volk_32f_s32f_convert_32i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1; + __m128i intInputVal1; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int32_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int32_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 32 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int32_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h deleted file mode 100644 index 0be649418..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H -#define INCLUDED_volk_32f_s32f_convert_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h deleted file mode 100644 index efb2c3a20..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1; - __m128i intInputVal1; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int32_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int32_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 32 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int32_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h new file mode 100644 index 000000000..c91448951 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -0,0 +1,117 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H +#define INCLUDED_volk_32f_s32f_convert_8i_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h new file mode 100644 index 000000000..420693571 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h @@ -0,0 +1,120 @@ +#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H +#define INCLUDED_volk_32f_s32f_convert_8i_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int sixteenthPoints = num_points / 16; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + + for(;number < sixteenthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); + intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); + intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); + intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); + + intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int8_t* outputVectorPtr = outputVector; + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + + float outputFloatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + ret = _mm_mul_ps(ret, vScalar); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); + *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (int8_t)(inputVector[number] * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 8 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int8_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h deleted file mode 100644 index 69ccec5c6..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H -#define INCLUDED_volk_32f_s32f_convert_8s_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h deleted file mode 100644 index af1652b19..000000000 --- a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h +++ /dev/null @@ -1,120 +0,0 @@ -#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H -#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int sixteenthPoints = num_points / 16; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2, inputVal3, inputVal4; - __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; - - for(;number < sixteenthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); - intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar)); - intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar)); - intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar)); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4); - - intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_SSE -#include - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int8_t* outputVectorPtr = outputVector; - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - - float outputFloatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - ret = _mm_mul_ps(ret, vScalar); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]); - *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (int8_t)(inputVector[number] * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 8 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int8_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h deleted file mode 100644 index 2ba809845..000000000 --- a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H -#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal1, aVal2, aVal3, aVal4; - __m128 cVal1, cVal2, cVal3, cVal4; - for(;number < sixteenthPoints; number++) { - aVal1 = _mm_load_ps(aPtr); aPtr += 4; - cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); - accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x - - aVal2 = _mm_load_ps(aPtr); aPtr += 4; - cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); - accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x - - aVal3 = _mm_load_ps(aPtr); aPtr += 4; - cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); - accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x - - aVal4 = _mm_load_ps(aPtr); aPtr += 4; - cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); - accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x - - cVal1 = _mm_or_ps(cVal1, cVal2); - cVal3 = _mm_or_ps(cVal3, cVal4); - cVal1 = _mm_or_ps(cVal1, cVal3); - - squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = sixteenthPoints * 16; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - float meanBuffer[4] __attribute__((aligned(128))); - float squareBuffer[4] __attribute__((aligned(128))); - - __m128 accumulator = _mm_setzero_ps(); - __m128 squareAccumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - for(;number < quarterPoints; number++) { - aVal = _mm_load_ps(aPtr); // aVal = x - accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x - aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 - squareAccumulator = _mm_add_ps(squareAccumulator, aVal); - aPtr += 4; - } - _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container - _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container - newMean = meanBuffer[0]; - newMean += meanBuffer[1]; - newMean += meanBuffer[2]; - newMean += meanBuffer[3]; - returnValue = squareBuffer[0]; - returnValue += squareBuffer[1]; - returnValue += squareBuffer[2]; - returnValue += squareBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the standard deviation and mean of the input buffer - \param stddev The calculated standard deviation - \param mean The mean of the input buffer - \param inputBuffer The buffer of points to calculate the std deviation for - \param num_points The number of values in input buffer to used in the stddev and mean calculations -*/ -static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - float newMean = 0; - if(num_points > 0){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - returnValue += (*aPtr) * (*aPtr); - newMean += *aPtr++; - } - newMean /= num_points; - returnValue /= num_points; - returnValue -= (newMean * newMean); - returnValue = sqrt(returnValue); - } - *stddev = returnValue; - *mean = newMean; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h new file mode 100644 index 000000000..278089841 --- /dev/null +++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h @@ -0,0 +1,169 @@ +#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H +#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal1, aVal2, aVal3, aVal4; + __m128 cVal1, cVal2, cVal3, cVal4; + for(;number < sixteenthPoints; number++) { + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1); + accumulator = _mm_add_ps(accumulator, aVal1); // accumulator += x + + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2); + accumulator = _mm_add_ps(accumulator, aVal2); // accumulator += x + + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4); + accumulator = _mm_add_ps(accumulator, aVal3); // accumulator += x + + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8); + accumulator = _mm_add_ps(accumulator, aVal4); // accumulator += x + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2 + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + float meanBuffer[4] __attribute__((aligned(128))); + float squareBuffer[4] __attribute__((aligned(128))); + + __m128 accumulator = _mm_setzero_ps(); + __m128 squareAccumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + for(;number < quarterPoints; number++) { + aVal = _mm_load_ps(aPtr); // aVal = x + accumulator = _mm_add_ps(accumulator, aVal); // accumulator += x + aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2 + squareAccumulator = _mm_add_ps(squareAccumulator, aVal); + aPtr += 4; + } + _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container + _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container + newMean = meanBuffer[0]; + newMean += meanBuffer[1]; + newMean += meanBuffer[2]; + newMean += meanBuffer[3]; + returnValue = squareBuffer[0]; + returnValue += squareBuffer[1]; + returnValue += squareBuffer[2]; + returnValue += squareBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the standard deviation and mean of the input buffer + \param stddev The calculated standard deviation + \param mean The mean of the input buffer + \param inputBuffer The buffer of points to calculate the std deviation for + \param num_points The number of values in input buffer to used in the stddev and mean calculations +*/ +static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + float newMean = 0; + if(num_points > 0){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + returnValue += (*aPtr) * (*aPtr); + newMean += *aPtr++; + } + newMean /= num_points; + returnValue /= num_points; + returnValue -= (newMean * newMean); + returnValue = sqrt(returnValue); + } + *stddev = returnValue; + *mean = newMean; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h new file mode 100644 index 000000000..d0d0e0a0e --- /dev/null +++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H +#define INCLUDED_volk_32f_x2_add_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h new file mode 100644 index 000000000..d844e25b0 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h @@ -0,0 +1,82 @@ +#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H +#define INCLUDED_volk_32f_x2_divide_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_div_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) / (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Divides the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be divideed + \param bVector The divisor vector + \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector +*/ +extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h new file mode 100644 index 000000000..61aa56815 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_load_ps(aPtr); aPtr += 4; + aVal2 = _mm_load_ps(aPtr); aPtr += 4; + aVal3 = _mm_load_ps(aPtr); aPtr += 4; + aVal4 = _mm_load_ps(aPtr); aPtr += 4; + + bVal1 = _mm_load_ps(bPtr); bPtr += 4; + bVal2 = _mm_load_ps(bPtr); bPtr += 4; + bVal3 = _mm_load_ps(bPtr); bPtr += 4; + bVal4 = _mm_load_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/ diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h new file mode 100644 index 000000000..8469a3cea --- /dev/null +++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h @@ -0,0 +1,184 @@ +#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H +#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) { + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr= taps; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE + + +static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float* input, const float* taps, unsigned int num_points) { + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_add_ps(cVal, dotProdVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal, bVal, cVal; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + dotProdVal = _mm_hadd_ps(dotProdVal, cVal); + + aPtr += 4; + bPtr += 4; + } + + float dotProductVector[4] __attribute__((aligned(16))); + dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); + + _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) { + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float dotProduct = 0; + const float* aPtr = input; + const float* bPtr = taps; + + __m128 aVal1, bVal1, cVal1; + __m128 aVal2, bVal2, cVal2; + __m128 aVal3, bVal3, cVal3; + __m128 aVal4, bVal4, cVal4; + + __m128 dotProdVal = _mm_setzero_ps(); + + for(;number < sixteenthPoints; number++){ + + aVal1 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal2 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal3 = _mm_loadu_ps(aPtr); aPtr += 4; + aVal4 = _mm_loadu_ps(aPtr); aPtr += 4; + + bVal1 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal2 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal3 = _mm_loadu_ps(bPtr); bPtr += 4; + bVal4 = _mm_loadu_ps(bPtr); bPtr += 4; + + cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1); + cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2); + cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4); + cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8); + + cVal1 = _mm_or_ps(cVal1, cVal2); + cVal3 = _mm_or_ps(cVal3, cVal4); + cVal1 = _mm_or_ps(cVal1, cVal3); + + dotProdVal = _mm_add_ps(dotProdVal, cVal1); + } + + float dotProductVector[4] __attribute__((aligned(16))); + _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector + + dotProduct = dotProductVector[0]; + dotProduct += dotProductVector[1]; + dotProduct += dotProductVector[2]; + dotProduct += dotProductVector[3]; + + number = sixteenthPoints * 16; + for(;number < num_points; number++){ + dotProduct += ((*aPtr++) * (*bPtr++)); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/ diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h new file mode 100644 index 000000000..29c9392df --- /dev/null +++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H +#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Interleaves the I & Q vector data into the complex vector + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + unsigned int number = 0; + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + const uint64_t quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + _mm_store_ps(complexVectorPtr, cplxValue); + complexVectorPtr += 4; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Interleaves the I & Q vector data into the complex vector. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param num_points The number of complex data values to be interleaved +*/ +static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){ + float* complexVectorPtr = (float*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = *iBufferPtr++; + *complexVectorPtr++ = *qBufferPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h new file mode 100644 index 000000000..26e7f1246 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H +#define INCLUDED_volk_32f_x2_max_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_max_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h new file mode 100644 index 000000000..23bae044c --- /dev/null +++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h @@ -0,0 +1,85 @@ +#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H +#define INCLUDED_volk_32f_x2_min_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_min_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const float a = *aPtr++; + const float b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h new file mode 100644 index 000000000..a0dcfa86e --- /dev/null +++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H +#define INCLUDED_volk_32f_x2_multiply_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_mul_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Multiplys the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector +*/ +extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h new file mode 100644 index 000000000..30306774d --- /dev/null +++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h @@ -0,0 +1,155 @@ +#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H +#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue1, cplxValue2; + __m128i intValue1, intValue2; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); + cplxValue1 = _mm_mul_ps(cplxValue1, vScalar); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); + cplxValue2 = _mm_mul_ps(cplxValue2, vScalar); + + intValue1 = _mm_cvtps_epi32(cplxValue1); + intValue2 = _mm_cvtps_epi32(cplxValue2); + + intValue1 = _mm_packs_epi32(intValue1, intValue2); + + _mm_store_si128((__m128i*)complexVectorPtr, intValue1); + complexVectorPtr += 8; + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_SSE +#include + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + const unsigned int quarterPoints = num_points / 4; + + __m128 iValue, qValue, cplxValue; + + int16_t* complexVectorPtr = (int16_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + iValue = _mm_load_ps(iBufferPtr); + qValue = _mm_load_ps(qBufferPtr); + + // Interleaves the lower two values in the i and q variables into one buffer + cplxValue = _mm_unpacklo_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + // Interleaves the upper two values in the i and q variables into one buffer + cplxValue = _mm_unpackhi_ps(iValue, qValue); + cplxValue = _mm_mul_ps(cplxValue, vScalar); + + _mm_store_ps(floatBuffer, cplxValue); + + *complexVectorPtr++ = (int16_t)(floatBuffer[0]); + *complexVectorPtr++ = (int16_t)(floatBuffer[1]); + *complexVectorPtr++ = (int16_t)(floatBuffer[2]); + *complexVectorPtr++ = (int16_t)(floatBuffer[3]); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int16_t*)(&complexVector[number]); + for(; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data. + \param iBuffer The I buffer data to be interleaved + \param qBuffer The Q buffer data to be interleaved + \param complexVector The complex output vector + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be interleaved + */ +static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){ + int16_t* complexVectorPtr = (int16_t*)complexVector; + const float* iBufferPtr = iBuffer; + const float* qBufferPtr = qBuffer; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); + *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */ diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h new file mode 100644 index 000000000..7404bfe79 --- /dev/null +++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H +#define INCLUDED_volk_32f_x2_subtract_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_sub_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) - (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Subtracts bVector form aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The initial vector + \param bVector The vector to be subtracted + \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector +*/ +extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */ diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h new file mode 100644 index 000000000..af9e39537 --- /dev/null +++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h @@ -0,0 +1,151 @@ +#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H +#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H + +#include +#include +#include + +#ifndef MAX +#define MAX(X,Y) ((X) > (Y)?(X):(Y)) +#endif + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; + + xmm9 = _mm_setzero_ps(); + xmm1 = _mm_setzero_ps(); + + xmm0 = _mm_load1_ps(¢er_point_array[0]); + xmm6 = _mm_load1_ps(¢er_point_array[1]); + xmm7 = _mm_load1_ps(¢er_point_array[2]); + xmm8 = _mm_load1_ps(¢er_point_array[3]); + //xmm11 = _mm_load1_ps(¢er_point_array[4]); + xmm10 = _mm_load1_ps(cutoff); + + int bound = num_bytes >> 4; + int leftovers = (num_bytes >> 2) & 3; + int i = 0; + + for(; i < bound; ++i) { + xmm2 = _mm_load_ps(src0); + xmm2 = _mm_max_ps(xmm10, xmm2); + xmm3 = _mm_mul_ps(xmm2, xmm2); + xmm4 = _mm_mul_ps(xmm2, xmm3); + xmm5 = _mm_mul_ps(xmm3, xmm3); + //xmm12 = _mm_mul_ps(xmm3, xmm4); + + xmm2 = _mm_mul_ps(xmm2, xmm0); + xmm3 = _mm_mul_ps(xmm3, xmm6); + xmm4 = _mm_mul_ps(xmm4, xmm7); + xmm5 = _mm_mul_ps(xmm5, xmm8); + //xmm12 = _mm_mul_ps(xmm12, xmm11); + + xmm2 = _mm_add_ps(xmm2, xmm3); + xmm3 = _mm_add_ps(xmm4, xmm5); + + src0 += 4; + + xmm9 = _mm_add_ps(xmm2, xmm9); + + xmm1 = _mm_add_ps(xmm3, xmm1); + + //xmm9 = _mm_add_ps(xmm12, xmm9); + } + + xmm2 = _mm_hadd_ps(xmm9, xmm1); + xmm3 = _mm_hadd_ps(xmm2, xmm2); + xmm4 = _mm_hadd_ps(xmm3, xmm3); + + _mm_store_ss(&result, xmm4); + + + + for(i = 0; i < leftovers; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth);// + + //center_point_array[4] * fith); + } + + result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; + + target[0] = result; +} + + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC + +static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) { + + + + float result = 0.0; + float fst = 0.0; + float sq = 0.0; + float thrd = 0.0; + float frth = 0.0; + //float fith = 0.0; + + + + int i = 0; + + for(; i < num_bytes >> 2; ++i) { + fst = src0[i]; + fst = MAX(fst, *cutoff); + + sq = fst * fst; + thrd = fst * sq; + frth = sq * sq; + //fith = sq * thrd; + + result += (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth); //+ + //center_point_array[4] * fith); + /*printf("%f12...%d\n", (center_point_array[0] * fst + + center_point_array[1] * sq + + center_point_array[2] * thrd + + center_point_array[3] * frth) + + //center_point_array[4] * fith) + + (center_point_array[4]), i); + */ + } + + result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); + + + + *target = result; +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h deleted file mode 100644 index cd9cc8160..000000000 --- a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H - -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - - - - asm volatile - ( - "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %[conjugator], %%r9\n\t" - " movq %%rcx, %%rax\n\t" - " movaps 0(%%r9), %%xmm8\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " xorps %%xmm8, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " xorps %%xmm8, %%xmm3\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - " xorps %%xmm8, %%xmm2\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); - - } - - return; -} -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - int bound = num_bytes >> 4; - int leftovers = num_bytes % 16; - - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " #movl 12(%%ebp), %%eax # input\n\t" - " #movl 16(%%ebp), %%edx # taps\n\t" - " #movl 20(%%ebp), %%ecx # n_bytes\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%[eax]), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%[edx]), %%xmm2\n\t" - " movl %[ecx], (%[out])\n\t" - " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" - - " xorps %%xmm1, %%xmm2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%[eax]), %%xmmA\n\t" - "# movaps (%[edx]), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%[edx]), %%xmm3\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " xorps %%xmm1, %%xmm3\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " movaps 16(%[eax]), %%xmm1\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " movaps 0(%[conjugator]), %%xmm1\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%[eax]), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %[eax]\n\t" - " movaps 32(%[edx]), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " xorps %%xmm1, %%xmm2\n\t" - " addl $32, %[edx]\n\t" - ".%=L1_test:\n\t" - " decl %[ecx]\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" - " shrl $4, %[ecx]\n\t" - " andl $1, %[ecx]\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " #movl 8(%%ebp), %[eax] \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%[out])\n\t" - " movss (%[out]), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %[eax] # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) - ); - - - - - printf("%d, %d\n", leftovers, bound); - - for(; leftovers > 0; leftovers -= 8) { - - - *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - - - -#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h deleted file mode 100644 index 2ccfcf2f2..000000000 --- a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h +++ /dev/null @@ -1,468 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H - -#include -#include -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; - - } - -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - int getem = num_bytes % 16; - - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - for(; getem > 0; getem -= 8) { - - - *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); - - } - - return; - - - - - - -} - -#endif /*LV_HAVE_SSE*/ - -#if LV_HAVE_SSE3 - -#include - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if((num_bytes >> 2) != 0) { - dotProduct += (*a) * (*b); - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_SSE4_1 - -#include - -static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); - // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now - /* - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - int i = 0; - - int bound = (num_bytes >> 5); - int leftovers = (num_bytes & 24) >> 3; - - real0 = _mm_sub_ps(real0, real0); - real1 = _mm_sub_ps(real1, real1); - im0 = _mm_sub_ps(im0, im0); - im1 = _mm_sub_ps(im1, im1); - - for(; i < bound; ++i) { - - - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - - } - - - - - real1 = _mm_xor_ps(real1, (__m128)neg); - - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { - - *result += input[i] * taps[i]; - } - */ -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h deleted file mode 100644 index 59259882c..000000000 --- a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H -#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); -static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h deleted file mode 100644 index 14f511697..000000000 --- a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H -#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H - -#include -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; - - lv_32fc_t diff; - memset(&diff, 0x0, 2*sizeof(float)); - - float sq_dist = 0.0; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm8 = _mm_load1_ps(&scalar); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_store_ps(target, xmm4); - - target += 4; - - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - xmm4 = _mm_mul_ps(xmm4, xmm8); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h deleted file mode 100644 index b6c72adbf..000000000 --- a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H -#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -#include - -static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - - - __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - lv_32fc_t diff; - float sq_dist; - int bound = num_bytes >> 5; - int leftovers0 = (num_bytes >> 4) & 1; - int leftovers1 = (num_bytes >> 3) & 1; - int i = 0; - - xmm1 = _mm_setzero_ps(); - xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); - xmm2 = _mm_load_ps((float*)&points[0]); - xmm1 = _mm_movelh_ps(xmm1, xmm1); - xmm3 = _mm_load_ps((float*)&points[2]); - - - for(; i < bound - 1; ++i) { - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - xmm3 = _mm_load_ps((float*)&points[2]); - - _mm_store_ps(target, xmm4); - - target += 4; - - } - - xmm4 = _mm_sub_ps(xmm1, xmm2); - xmm5 = _mm_sub_ps(xmm1, xmm3); - - - - points += 4; - xmm6 = _mm_mul_ps(xmm4, xmm4); - xmm7 = _mm_mul_ps(xmm5, xmm5); - - xmm4 = _mm_hadd_ps(xmm6, xmm7); - - _mm_store_ps(target, xmm4); - - target += 4; - - for(i = 0; i < leftovers0; ++i) { - - xmm2 = _mm_load_ps((float*)&points[0]); - - xmm4 = _mm_sub_ps(xmm1, xmm2); - - points += 2; - - xmm6 = _mm_mul_ps(xmm4, xmm4); - - xmm4 = _mm_hadd_ps(xmm6, xmm6); - - _mm_storeh_pi((__m64*)target, xmm4); - - target += 2; - } - - for(i = 0; i < leftovers1; ++i) { - - diff = src0[0] - points[0]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[0] = sq_dist; - } -} - -#endif /*LV_HAVE_SSE3*/ - -#if LV_HAVE_GENERIC -static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { - lv_32fc_t diff; - float sq_dist; - int i = 0; - - for(; i < num_bytes >> 3; ++i) { - diff = src0[0] - points[i]; - - sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); - - target[i] = sq_dist; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h deleted file mode 100644 index 3ee579c2e..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector into I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..84d2576ed --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector into I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h deleted file mode 100644 index 404defc36..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h +++ /dev/null @@ -1,78 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H -#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - const unsigned int halfPoints = num_points / 2; - __m128 cplxValue, fVal; - __m128d dVal; - - for(;number < halfPoints; number++){ - - cplxValue = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i1i2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(iBufferPtr, dVal); - - // Arrange in q1q2q1q2 format - fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); - dVal = _mm_cvtps_pd(fVal); - _mm_store_pd(qBufferPtr, dVal); - - iBufferPtr += 2; - qBufferPtr += 2; - } - - number = halfPoints * 2; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - *qBufferPtr++ = *complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const float* complexVectorPtr = (float*)complexVector; - double* iBufferPtr = iBuffer; - double* qBufferPtr = qBuffer; - - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (double)*complexVectorPtr++; - *qBufferPtr++ = (double)*complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h new file mode 100644 index 000000000..34262a7af --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h @@ -0,0 +1,78 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H +#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + const unsigned int halfPoints = num_points / 2; + __m128 cplxValue, fVal; + __m128d dVal; + + for(;number < halfPoints; number++){ + + cplxValue = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i1i2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(iBufferPtr, dVal); + + // Arrange in q1q2q1q2 format + fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1)); + dVal = _mm_cvtps_pd(fVal); + _mm_store_pd(qBufferPtr, dVal); + + iBufferPtr += 2; + qBufferPtr += 2; + } + + number = halfPoints * 2; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + *qBufferPtr++ = *complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const float* complexVectorPtr = (float*)complexVector; + double* iBufferPtr = iBuffer; + double* qBufferPtr = qBuffer; + + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (double)*complexVectorPtr++; + *qBufferPtr++ = (double)*complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..6042e6d62 --- /dev/null +++ b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h deleted file mode 100644 index 53235e5f7..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h new file mode 100644 index 000000000..530359600 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h @@ -0,0 +1,158 @@ +#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H +#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (const float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + result = _mm_sqrt_ps(result); + + result = _mm_mul_ps(result, vScalar); + + _mm_store_ps(floatBuffer, result); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); + *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + magnitudeVectorPtr = &magnitudeVector[number]; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param scalar The scale value multiplied to the magnitude of each complex vector + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector +*/ +extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); +static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h deleted file mode 100644 index dc3c6741a..000000000 --- a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H -#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (const float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - result = _mm_sqrt_ps(result); - - result = _mm_mul_ps(result, vScalar); - - _mm_store_ps(floatBuffer, result); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]); - *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - magnitudeVectorPtr = &magnitudeVector[number]; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param scalar The scale value multiplied to the magnitude of each complex vector - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector -*/ -extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); -static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h deleted file mode 100644 index 29ccdaef7..000000000 --- a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h +++ /dev/null @@ -1,134 +0,0 @@ -#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H -#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE3 -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - const float* inputPtr = (const float*)complexFFTInput; - float* destPtr = logPowerOutput; - uint64_t number = 0; - const float iRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - -#if LV_HAVE_LIB_SIMDMATH - __m128 magScalar = _mm_set_ps1(10.0); - magScalar = _mm_div_ps(magScalar, logf4(magScalar)); - - __m128 invRBW = _mm_set_ps1(iRBW); - - __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); - - __m128 power; - __m128 input1, input2; - const uint64_t quarterPoints = num_points / 4; - for(;number < quarterPoints; number++){ - // Load the complex values - input1 =_mm_load_ps(inputPtr); - inputPtr += 4; - input2 =_mm_load_ps(inputPtr); - inputPtr += 4; - - // Apply the normalization factor - input1 = _mm_mul_ps(input1, invNormalizationFactor); - input2 = _mm_mul_ps(input2, invNormalizationFactor); - - // Multiply each value by itself - // (r1*r1), (i1*i1), (r2*r2), (i2*i2) - input1 = _mm_mul_ps(input1, input1); - // (r3*r3), (i3*i3), (r4*r4), (i4*i4) - input2 = _mm_mul_ps(input2, input2); - - // Horizontal add, to add (r*r) + (i*i) for each complex value - // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) - power = _mm_hadd_ps(input1, input2); - - // Divide by the rbw - power = _mm_mul_ps(power, invRBW); - - // Calculate the natural log power - power = logf4(power); - - // Convert to log10 and multiply by 10.0 - power = _mm_mul_ps(power, magScalar); - - // Store the floating point results - _mm_store_ps(destPtr, power); - - destPtr += 4; - } - - number = quarterPoints*4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - // Calculate the FFT for any remaining points - for(; number < num_points; number++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); - destPtr++; - } - -} -#endif /* LV_HAVE_SSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the log10 power value divided by the RBW for each input point - \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point - \param complexFFTInput The complex data output from the FFT point - \param normalizationFactor This value is divided against all the input values before the power is calculated - \param rbw The resolution bandwith of the fft spectrum - \param num_points The number of fft data points -*/ -static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ - // Calculate the Power of the complex point - const float* inputPtr = (float*)complexFFTInput; - float* realFFTDataPointsPtr = logPowerOutput; - unsigned int point; - const float invRBW = 1.0 / rbw; - const float iNormalizationFactor = 1.0 / normalizationFactor; - - for(point = 0; point < num_points; point++){ - // Calculate dBm - // 50 ohm load assumption - // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) - // 75 ohm load assumption - // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) - - const float real = *inputPtr++ * iNormalizationFactor; - const float imag = *inputPtr++ * iNormalizationFactor; - - *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); - - realFFTDataPointsPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h new file mode 100644 index 000000000..0120b5307 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h @@ -0,0 +1,134 @@ +#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H +#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + const float* inputPtr = (const float*)complexFFTInput; + float* destPtr = logPowerOutput; + uint64_t number = 0; + const float iRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + +#if LV_HAVE_LIB_SIMDMATH + __m128 magScalar = _mm_set_ps1(10.0); + magScalar = _mm_div_ps(magScalar, logf4(magScalar)); + + __m128 invRBW = _mm_set_ps1(iRBW); + + __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor); + + __m128 power; + __m128 input1, input2; + const uint64_t quarterPoints = num_points / 4; + for(;number < quarterPoints; number++){ + // Load the complex values + input1 =_mm_load_ps(inputPtr); + inputPtr += 4; + input2 =_mm_load_ps(inputPtr); + inputPtr += 4; + + // Apply the normalization factor + input1 = _mm_mul_ps(input1, invNormalizationFactor); + input2 = _mm_mul_ps(input2, invNormalizationFactor); + + // Multiply each value by itself + // (r1*r1), (i1*i1), (r2*r2), (i2*i2) + input1 = _mm_mul_ps(input1, input1); + // (r3*r3), (i3*i3), (r4*r4), (i4*i4) + input2 = _mm_mul_ps(input2, input2); + + // Horizontal add, to add (r*r) + (i*i) for each complex value + // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4) + power = _mm_hadd_ps(input1, input2); + + // Divide by the rbw + power = _mm_mul_ps(power, invRBW); + + // Calculate the natural log power + power = logf4(power); + + // Convert to log10 and multiply by 10.0 + power = _mm_mul_ps(power, magScalar); + + // Store the floating point results + _mm_store_ps(destPtr, power); + + destPtr += 4; + } + + number = quarterPoints*4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + // Calculate the FFT for any remaining points + for(; number < num_points; number++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); + destPtr++; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the log10 power value divided by the RBW for each input point + \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point + \param complexFFTInput The complex data output from the FFT point + \param normalizationFactor This value is divided against all the input values before the power is calculated + \param rbw The resolution bandwith of the fft spectrum + \param num_points The number of fft data points +*/ +static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){ + // Calculate the Power of the complex point + const float* inputPtr = (float*)complexFFTInput; + float* realFFTDataPointsPtr = logPowerOutput; + unsigned int point; + const float invRBW = 1.0 / rbw; + const float iNormalizationFactor = 1.0 / normalizationFactor; + + for(point = 0; point < num_points; point++){ + // Calculate dBm + // 50 ohm load assumption + // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10) + // 75 ohm load assumption + // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15) + + const float real = *inputPtr++ * iNormalizationFactor; + const float imag = *inputPtr++ * iNormalizationFactor; + + *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); + + realFFTDataPointsPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h new file mode 100644 index 000000000..a01971df3 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h @@ -0,0 +1,344 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H + +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + + + + asm volatile + ( + "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %[conjugator], %%r9\n\t" + " movq %%rcx, %%rax\n\t" + " movaps 0(%%r9), %%xmm8\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " xorps %%xmm8, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " xorps %%xmm8, %%xmm3\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + " xorps %%xmm8, %%xmm2\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1])); + + } + + return; +} +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + int bound = num_bytes >> 4; + int leftovers = num_bytes % 16; + + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " #movl 12(%%ebp), %%eax # input\n\t" + " #movl 16(%%ebp), %%edx # taps\n\t" + " #movl 20(%%ebp), %%ecx # n_bytes\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%[eax]), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%[edx]), %%xmm2\n\t" + " movl %[ecx], (%[out])\n\t" + " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t" + + " xorps %%xmm1, %%xmm2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%[eax]), %%xmmA\n\t" + "# movaps (%[edx]), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%[edx]), %%xmm3\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " xorps %%xmm1, %%xmm3\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " movaps 16(%[eax]), %%xmm1\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " movaps 0(%[conjugator]), %%xmm1\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%[eax]), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %[eax]\n\t" + " movaps 32(%[edx]), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " xorps %%xmm1, %%xmm2\n\t" + " addl $32, %[edx]\n\t" + ".%=L1_test:\n\t" + " decl %[ecx]\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t" + " shrl $4, %[ecx]\n\t" + " andl $1, %[ecx]\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " #movl 8(%%ebp), %[eax] \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%[out])\n\t" + " movss (%[out]), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %[eax] # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator) + ); + + + + + printf("%d, %d\n", leftovers, bound); + + for(; leftovers > 0; leftovers -= 8) { + + + *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)])); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h new file mode 100644 index 000000000..9a7b65ab4 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h @@ -0,0 +1,468 @@ +#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H +#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H + +#include +#include +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]; + + } + +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + int getem = num_bytes % 16; + + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + for(; getem > 0; getem -= 8) { + + + *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]); + + } + + return; + + + + + + +} + +#endif /*LV_HAVE_SSE*/ + +#if LV_HAVE_SSE3 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + lv_32fc_t dotProductVector[2] __attribute__((aligned(16))); + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if((num_bytes >> 2) != 0) { + dotProduct += (*a) * (*b); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_SSE4_1 + +#include + +static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes); + // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now + /* + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + int i = 0; + + int bound = (num_bytes >> 5); + int leftovers = (num_bytes & 24) >> 3; + + real0 = _mm_sub_ps(real0, real0); + real1 = _mm_sub_ps(real1, real1); + im0 = _mm_sub_ps(im0, im0); + im1 = _mm_sub_ps(im1, im1); + + for(; i < bound; ++i) { + + + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + + } + + + + + real1 = _mm_xor_ps(real1, (__m128)neg); + + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = bound * 4; i < (bound * 4) + leftovers; ++i) { + + *result += input[i] * taps[i]; + } + */ +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h new file mode 100644 index 000000000..224ab19c8 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -0,0 +1,95 @@ +#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H +#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + static const float mask = -0.0; + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h new file mode 100644 index 000000000..6a863b16d --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h @@ -0,0 +1,126 @@ +#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H +#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H + +#include +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; + + lv_32fc_t diff; + memset(&diff, 0x0, 2*sizeof(float)); + + float sq_dist = 0.0; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm8 = _mm_load1_ps(&scalar); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_store_ps(target, xmm4); + + target += 4; + + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + xmm4 = _mm_mul_ps(xmm4, xmm8); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/ diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h new file mode 100644 index 000000000..406097fc8 --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h @@ -0,0 +1,112 @@ +#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H +#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE3 +#include +#include + +static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + + + __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + lv_32fc_t diff; + float sq_dist; + int bound = num_bytes >> 5; + int leftovers0 = (num_bytes >> 4) & 1; + int leftovers1 = (num_bytes >> 3) & 1; + int i = 0; + + xmm1 = _mm_setzero_ps(); + xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); + xmm2 = _mm_load_ps((float*)&points[0]); + xmm1 = _mm_movelh_ps(xmm1, xmm1); + xmm3 = _mm_load_ps((float*)&points[2]); + + + for(; i < bound - 1; ++i) { + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + xmm3 = _mm_load_ps((float*)&points[2]); + + _mm_store_ps(target, xmm4); + + target += 4; + + } + + xmm4 = _mm_sub_ps(xmm1, xmm2); + xmm5 = _mm_sub_ps(xmm1, xmm3); + + + + points += 4; + xmm6 = _mm_mul_ps(xmm4, xmm4); + xmm7 = _mm_mul_ps(xmm5, xmm5); + + xmm4 = _mm_hadd_ps(xmm6, xmm7); + + _mm_store_ps(target, xmm4); + + target += 4; + + for(i = 0; i < leftovers0; ++i) { + + xmm2 = _mm_load_ps((float*)&points[0]); + + xmm4 = _mm_sub_ps(xmm1, xmm2); + + points += 2; + + xmm6 = _mm_mul_ps(xmm4, xmm4); + + xmm4 = _mm_hadd_ps(xmm6, xmm6); + + _mm_storeh_pi((__m64*)target, xmm4); + + target += 2; + } + + for(i = 0; i < leftovers1; ++i) { + + diff = src0[0] - points[0]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[0] = sq_dist; + } +} + +#endif /*LV_HAVE_SSE3*/ + +#if LV_HAVE_GENERIC +static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) { + lv_32fc_t diff; + float sq_dist; + int i = 0; + + for(; i < num_bytes >> 3; ++i) { + diff = src0[0] - points[i]; + + sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); + + target[i] = sq_dist; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..0fcadd9cb --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H +#define INCLUDED_volk_32i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_load_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_store_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h new file mode 100644 index 000000000..1dd6422f8 --- /dev/null +++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h @@ -0,0 +1,75 @@ +#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H +#define INCLUDED_volk_32i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int32_t* inputPtr = (int32_t*)inputVector; + __m128i inputVal; + __m128 ret; + + for(;number < quarterPoints; number++){ + + // Load the 4 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + inputPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) * iScalar; + } +} +#endif /* LV_HAVE_SSE2 */ + + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 32 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int32_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h new file mode 100644 index 000000000..3baa1d856 --- /dev/null +++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H +#define INCLUDED_volk_32i_x2_and_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_and_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] & bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) & (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ands the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors + \param bVector One of the vectors + \param num_points The number of values in aVector and bVector to be anded together and stored into cVector +*/ +extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */ diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h new file mode 100644 index 000000000..0be22f00a --- /dev/null +++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h @@ -0,0 +1,81 @@ +#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H +#define INCLUDED_volk_32i_x2_or_32i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = (float*)cVector; + const float* aPtr = (float*)aVector; + const float* bPtr = (float*)bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_or_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + cVector[number] = aVector[number] | bVector[number]; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + int32_t* cPtr = cVector; + const int32_t* aPtr = aVector; + const int32_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) | (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC +/*! + \brief Ors the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be ored + \param bVector One of the vectors to be ored + \param num_points The number of values in aVector and bVector to be ored together and stored into cVector +*/ +extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); +static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ + volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h deleted file mode 100644 index 0e8380757..000000000 --- a/volk/include/volk/volk_32s_32s_and_32s_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H -#define INCLUDED_volk_32s_32s_and_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_and_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] & bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) & (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ands the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors - \param bVector One of the vectors - \param num_points The number of values in aVector and bVector to be anded together and stored into cVector -*/ -extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h deleted file mode 100644 index 2dcf2e551..000000000 --- a/volk/include/volk/volk_32s_32s_or_32s_a16.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H -#define INCLUDED_volk_32s_32s_or_32s_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = (float*)cVector; - const float* aPtr = (float*)aVector; - const float* bPtr = (float*)bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_or_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - cVector[number] = aVector[number] | bVector[number]; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - int32_t* cPtr = cVector; - const int32_t* aPtr = aVector; - const int32_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) | (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC -/*! - \brief Ors the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be ored - \param bVector One of the vectors to be ored - \param num_points The number of values in aVector and bVector to be ored together and stored into cVector -*/ -extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points); -static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){ - volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h deleted file mode 100644 index c16ecc9dd..000000000 --- a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H -#define INCLUDED_volk_32s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_load_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_store_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */ diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h deleted file mode 100644 index 4eb5a5b85..000000000 --- a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int32_t* inputPtr = (int32_t*)inputVector; - __m128i inputVal; - __m128 ret; - - for(;number < quarterPoints; number++){ - - // Load the 4 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - inputPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) * iScalar; - } -} -#endif /* LV_HAVE_SSE2 */ - - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 32 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int32_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h deleted file mode 100644 index 7e091851f..000000000 --- a/volk/include/volk/volk_64f_64f_max_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H -#define INCLUDED_volk_64f_64f_max_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_max_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a > b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h deleted file mode 100644 index f2bcbe83b..000000000 --- a/volk/include/volk/volk_64f_64f_min_64f_a16.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H -#define INCLUDED_volk_64f_64f_min_64f_a16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - - __m128d aVal, bVal, cVal; - for(;number < halfPoints; number++){ - - aVal = _mm_load_pd(aPtr); - bVal = _mm_load_pd(bPtr); - - cVal = _mm_min_pd(aVal, bVal); - - _mm_store_pd(cPtr,cVal); // Store the results back into the C container - - aPtr += 2; - bPtr += 2; - cPtr += 2; - } - - number = halfPoints * 2; - for(;number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_SSE2 */ - -#if LV_HAVE_GENERIC -/*! - \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector - \param cVector The vector where the results will be stored - \param aVector The vector to be checked - \param bVector The vector to be checked - \param num_points The number of values in aVector and bVector to be checked and stored into cVector -*/ -static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ - double* cPtr = cVector; - const double* aPtr = aVector; - const double* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - const double a = *aPtr++; - const double b = *bPtr++; - *cPtr++ = ( a < b ? a : b); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h new file mode 100644 index 000000000..6338c1433 --- /dev/null +++ b/volk/include/volk/volk_64f_convert_32f_u.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_volk_64f_convert_32f_u_H +#define INCLUDED_volk_64f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE2 +#include + /*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted + */ +static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const double* inputVectorPtr = (const double*)inputVector; + float* outputVectorPtr = outputVector; + __m128 ret, ret2; + __m128d inputVal1, inputVal2; + + for(;number < quarterPoints; number++){ + inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; + + ret = _mm_cvtpd_ps(inputVal1); + ret2 = _mm_cvtpd_ps(inputVal2); + + ret = _mm_movelh_ps(ret, ret2); + + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]); + } +} +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts the double values into float values + \param dVector The converted float vector values + \param fVector The double vector values to be converted + \param num_points The number of points in the two vectors to be converted +*/ +static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const double* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_64f_convert_32f_u_H */ diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h deleted file mode 100644 index 7774db1b7..000000000 --- a/volk/include/volk/volk_64f_convert_32f_ua16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_volk_64f_convert_32f_ua16_H -#define INCLUDED_volk_64f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE2 -#include - /*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted - */ -static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const double* inputVectorPtr = (const double*)inputVector; - float* outputVectorPtr = outputVector; - __m128 ret, ret2; - __m128d inputVal1, inputVal2; - - for(;number < quarterPoints; number++){ - inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2; - - ret = _mm_cvtpd_ps(inputVal1); - ret2 = _mm_cvtpd_ps(inputVal2); - - ret = _mm_movelh_ps(ret, ret2); - - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]); - } -} -#endif /* LV_HAVE_SSE2 */ - - -#ifdef LV_HAVE_GENERIC -/*! - \brief Converts the double values into float values - \param dVector The converted float vector values - \param fVector The double vector values to be converted - \param num_points The number of points in the two vectors to be converted -*/ -static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const double* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */ diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h new file mode 100644 index 000000000..4b0c1f5f1 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H +#define INCLUDED_volk_64f_x2_max_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_max_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a > b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */ diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h new file mode 100644 index 000000000..aa961e384 --- /dev/null +++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h @@ -0,0 +1,71 @@ +#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H +#define INCLUDED_volk_64f_x2_min_64f_a16_H + +#include +#include + +#if LV_HAVE_SSE2 +#include +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + + __m128d aVal, bVal, cVal; + for(;number < halfPoints; number++){ + + aVal = _mm_load_pd(aPtr); + bVal = _mm_load_pd(bPtr); + + cVal = _mm_min_pd(aVal, bVal); + + _mm_store_pd(cPtr,cVal); // Store the results back into the C container + + aPtr += 2; + bPtr += 2; + cPtr += 2; + } + + number = halfPoints * 2; + for(;number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_SSE2 */ + +#if LV_HAVE_GENERIC +/*! + \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector + \param cVector The vector where the results will be stored + \param aVector The vector to be checked + \param bVector The vector to be checked + \param num_points The number of values in aVector and bVector to be checked and stored into cVector +*/ +static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){ + double* cPtr = cVector; + const double* aPtr = aVector; + const double* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + const double a = *aPtr++; + const double b = *bPtr++; + *cPtr++ = ( a < b ? a : b); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h new file mode 100644 index 000000000..3d7045753 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_a16.h @@ -0,0 +1,83 @@ +#ifndef INCLUDED_volk_8i_convert_16i_a16_H +#define INCLUDED_volk_8i_convert_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_store_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); +static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h new file mode 100644 index 000000000..bcff13406 --- /dev/null +++ b/volk/include/volk/volk_8i_convert_16i_u.h @@ -0,0 +1,73 @@ +#ifndef INCLUDED_volk_8i_convert_16i_u_H +#define INCLUDED_volk_8i_convert_16i_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const __m128i* inputVectorPtr = (const __m128i*)inputVector; + __m128i* outputVectorPtr = (__m128i*)outputVector; + __m128i inputVal; + __m128i ret; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128(inputVectorPtr); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVal = _mm_srli_si128(inputVal, 8); + ret = _mm_cvtepi8_epi16(inputVal); + ret = _mm_slli_epi16(ret, 8); // Multiply by 256 + _mm_storeu_si128(outputVectorPtr, ret); + + outputVectorPtr++; + + inputVectorPtr++; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (int16_t)(inputVector[number])*256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into 16 bit integer data + \param inputVector The 8 bit input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + \note Input and output buffers do NOT need to be properly aligned + */ +static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h new file mode 100644 index 000000000..d5c8eeb51 --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -0,0 +1,105 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H +#define INCLUDED_volk_8i_s32f_convert_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_load_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_store_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + +#if LV_HAVE_ORC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + */ +extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); +static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); +} +#endif /* LV_HAVE_ORC */ + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h new file mode 100644 index 000000000..1e30957e8 --- /dev/null +++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h @@ -0,0 +1,94 @@ +#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H +#define INCLUDED_volk_8i_s32f_convert_32f_u_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int sixteenthPoints = num_points / 16; + + float* outputVectorPtr = outputVector; + const float iScalar = 1.0 / scalar; + __m128 invScalar = _mm_set_ps1( iScalar ); + const int8_t* inputVectorPtr = inputVector; + __m128 ret; + __m128i inputVal; + __m128i interimVal; + + for(;number < sixteenthPoints; number++){ + inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); + + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVal = _mm_srli_si128(inputVal, 4); + interimVal = _mm_cvtepi8_epi32(inputVal); + ret = _mm_cvtepi32_ps(interimVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + inputVectorPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) * iScalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC + /*! + \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value + \param inputVector The 8 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value divided against each point in the output buffer + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int8_t* inputVectorPtr = inputVector; + unsigned int number = 0; + const float iScalar = 1.0 / scalar; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h new file mode 100644 index 000000000..91c9b2c58 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h @@ -0,0 +1,77 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H +#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + __m128i complexVal, iOutputVal, qOutputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iOutputVal = _mm_cvtepi8_epi16(iOutputVal); + iOutputVal = _mm_slli_epi16(iOutputVal, 8); + + qOutputVal = _mm_cvtepi8_epi16(qOutputVal); + qOutputVal = _mm_slli_epi16(qOutputVal, 8); + + _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); + _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); + + iBufferPtr += 8; + qBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + int16_t* qBufferPtr = qBuffer; + unsigned int number; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..bf3dc20dd --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h @@ -0,0 +1,66 @@ +#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H +#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i complexVal, outputVal; + + unsigned int eighthPoints = num_points / 8; + + for(number = 0; number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + outputVal = _mm_cvtepi8_epi16(complexVal); + outputVal = _mm_slli_epi16(outputVal, 7); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + int16_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h new file mode 100644 index 000000000..13de79423 --- /dev/null +++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h @@ -0,0 +1,67 @@ +#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H +#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H + +#include +#include + +#if LV_HAVE_SSSE3 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + __m128i complexVal1, complexVal2, outputVal; + + unsigned int sixteenthPoints = num_points / 16; + + for(number = 0; number < sixteenthPoints; number++){ + complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + + complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); + complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); + + outputVal = _mm_or_si128(complexVal1, complexVal2); + + _mm_store_si128((__m128i*)iBufferPtr, outputVal); + iBufferPtr += 16; + } + + number = sixteenthPoints * 16; + for(; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSSE3 */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (int8_t*)complexVector; + int8_t* iBufferPtr = iBuffer; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = *complexVectorPtr++; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h new file mode 100644 index 000000000..22c3ebb23 --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h @@ -0,0 +1,164 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue, qFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); + qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + iComplexVal = _mm_srli_si128(iComplexVal, 4); + + iIntVal = _mm_cvtepi8_epi32(iComplexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + _mm_store_ps(iBufferPtr, iFloatValue); + iBufferPtr += 4; + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + qBufferPtr += 4; + + qComplexVal = _mm_srli_si128(qComplexVal, 4); + + qIntVal = _mm_cvtepi8_epi32(qComplexVal); + qFloatValue = _mm_cvtepi32_ps(qIntVal); + qFloatValue = _mm_mul_ps(qFloatValue, invScalar); + _mm_store_ps(qBufferPtr, qFloatValue); + + qBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 cplxValue1, cplxValue2, iValue, qValue; + + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[8] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(complexVectorPtr[0]); + floatBuffer[1] = (float)(complexVectorPtr[1]); + floatBuffer[2] = (float)(complexVectorPtr[2]); + floatBuffer[3] = (float)(complexVectorPtr[3]); + + floatBuffer[4] = (float)(complexVectorPtr[4]); + floatBuffer[5] = (float)(complexVectorPtr[5]); + floatBuffer[6] = (float)(complexVectorPtr[6]); + floatBuffer[7] = (float)(complexVectorPtr[7]); + + cplxValue1 = _mm_load_ps(&floatBuffer[0]); + cplxValue2 = _mm_load_ps(&floatBuffer[4]); + + complexVectorPtr += 8; + + cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); + cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + _mm_store_ps(iBufferPtr, iValue); + _mm_store_ps(qBufferPtr, qValue); + + iBufferPtr += 4; + qBufferPtr += 4; + } + + number = quarterPoints * 4; + complexVectorPtr = (int8_t*)&complexVector[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param qBuffer The Q buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + float* qBufferPtr = qBuffer; + unsigned int number; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */ diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h new file mode 100644 index 000000000..5f1430394 --- /dev/null +++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h @@ -0,0 +1,133 @@ +#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H +#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + __m128 iFloatValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + __m128i complexVal, iIntVal; + int8_t* complexVectorPtr = (int8_t*)complexVector; + + __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + + for(;number < eighthPoints; number++){ + complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; + complexVal = _mm_shuffle_epi8(complexVal, moveMask); + + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + + complexVal = _mm_srli_si128(complexVal, 4); + iIntVal = _mm_cvtepi8_epi32(complexVal); + iFloatValue = _mm_cvtepi32_ps(iIntVal); + + iFloatValue = _mm_mul_ps(iFloatValue, invScalar); + + _mm_store_ps(iBufferPtr, iFloatValue); + + iBufferPtr += 4; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE4_1 */ + + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + float* iBufferPtr = iBuffer; + + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + __m128 iValue; + + const float iScalar= 1.0 / scalar; + __m128 invScalar = _mm_set_ps1(iScalar); + int8_t* complexVectorPtr = (int8_t*)complexVector; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; + floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; + + iValue = _mm_load_ps(floatBuffer); + + iValue = _mm_mul_ps(iValue, invScalar); + + _mm_store_ps(iBufferPtr, iValue); + + iBufferPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex 8 bit vector into I float vector data + \param complexVector The complex input vector + \param iBuffer The I buffer output data + \param scalar The scaling value being multiplied against each data point + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const int8_t* complexVectorPtr = (const int8_t*)complexVector; + float* iBufferPtr = iBuffer; + const float invScalar = 1.0 / scalar; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; + complexVectorPtr++; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h new file mode 100644 index 000000000..d9cacbf46 --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h @@ -0,0 +1,102 @@ +#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H +#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + lv_16sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); + + a += 4; + b += 4; + c += 4; + } + + number = quarterPoints * 4; + int16_t* c16Ptr = (int16_t*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + int16_t* c16Ptr = (int16_t*)cVector; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number =0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *c16Ptr++ = (int16_t)lv_creal(temp); + *c16Ptr++ = (int16_t)lv_cimag(temp); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */ diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h new file mode 100644 index 000000000..6ec923a4f --- /dev/null +++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h @@ -0,0 +1,122 @@ +#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H +#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H + +#include +#include +#include + +#if LV_HAVE_SSE4_1 +#include +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + __m128i x, y, realz, imagz; + __m128 ret; + lv_32fc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); + const int shuffleMask = _MM_SHUFFLE(2,3,0,1); + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + for(;number < quarterPoints; number++){ + // Convert into 8 bit values into 16 bit values + x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); + y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); + + // Calculate the ar*cr - ai*(-ci) portions + realz = _mm_madd_epi16(x,y); + + // Calculate the complex conjugate of the cr + ci j values + y = _mm_sign_epi16(y, conjugateSign); + + // Shift the order of the cr and ci values + y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); + + // Calculate the ar*(-ci) + cr*(ai) + imagz = _mm_madd_epi16(x,y); + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + // Interleave real and imaginary and then convert to float values + ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); + + // Normalize the floating point values + ret = _mm_mul_ps(ret, invScalar); + + // Store the floating point values + _mm_store_ps((float*)c, ret); + c += 2; + + a += 4; + b += 4; + } + + number = quarterPoints * 4; + float* cFloatPtr = (float*)&cVector[number]; + int8_t* a8Ptr = (int8_t*)&aVector[number]; + int8_t* b8Ptr = (int8_t*)&bVector[number]; + for(; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cFloatPtr++ = lv_creal(temp) / scalar; + *cFloatPtr++ = lv_cimag(temp) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#if LV_HAVE_GENERIC +/*! + \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector + \param cVector The complex vector where the results will be stored + \param aVector One of the complex vectors to be multiplied + \param bVector The complex vector which will be converted to complex conjugate and multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + float* cPtr = (float*)cVector; + const float invScalar = 1.0 / scalar; + int8_t* a8Ptr = (int8_t*)aVector; + int8_t* b8Ptr = (int8_t*)bVector; + for(number = 0; number < num_points; number++){ + float aReal = (float)*a8Ptr++; + float aImag = (float)*a8Ptr++; + lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); + float bReal = (float)*b8Ptr++; + float bImag = (float)*b8Ptr++; + lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); + lv_32fc_t temp = aVal * bVal; + + *cPtr++ = (lv_creal(temp) * invScalar); + *cPtr++ = (lv_cimag(temp) * invScalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h deleted file mode 100644 index 38efdb6a3..000000000 --- a/volk/include/volk/volk_8s_convert_16s_a16.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef INCLUDED_volk_8s_convert_16s_a16_H -#define INCLUDED_volk_8s_convert_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_store_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points); -static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h deleted file mode 100644 index a726bfb5e..000000000 --- a/volk/include/volk/volk_8s_convert_16s_ua16.h +++ /dev/null @@ -1,73 +0,0 @@ -#ifndef INCLUDED_volk_8s_convert_16s_ua16_H -#define INCLUDED_volk_8s_convert_16s_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - const __m128i* inputVectorPtr = (const __m128i*)inputVector; - __m128i* outputVectorPtr = (__m128i*)outputVector; - __m128i inputVal; - __m128i ret; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128(inputVectorPtr); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVal = _mm_srli_si128(inputVal, 8); - ret = _mm_cvtepi8_epi16(inputVal); - ret = _mm_slli_epi16(ret, 8); // Multiply by 256 - _mm_storeu_si128(outputVectorPtr, ret); - - outputVectorPtr++; - - inputVectorPtr++; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (int16_t)(inputVector[number])*256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into 16 bit integer data - \param inputVector The 8 bit input data buffer - \param outputVector The 16 bit output data buffer - \param num_points The number of data values to be converted - \note Input and output buffers do NOT need to be properly aligned - */ -static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h deleted file mode 100644 index 45185ac2e..000000000 --- a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H -#define INCLUDED_volk_8s_s32f_convert_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_load_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_store_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - -#if LV_HAVE_ORC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); -static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); -} -#endif /* LV_HAVE_ORC */ - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h deleted file mode 100644 index 310824580..000000000 --- a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H -#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int sixteenthPoints = num_points / 16; - - float* outputVectorPtr = outputVector; - const float iScalar = 1.0 / scalar; - __m128 invScalar = _mm_set_ps1( iScalar ); - const int8_t* inputVectorPtr = inputVector; - __m128 ret; - __m128i inputVal; - __m128i interimVal; - - for(;number < sixteenthPoints; number++){ - inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr); - - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVal = _mm_srli_si128(inputVal, 4); - interimVal = _mm_cvtepi8_epi32(inputVal); - ret = _mm_cvtepi32_ps(interimVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - inputVectorPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) * iScalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC - /*! - \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 8 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int8_t* inputVectorPtr = inputVector; - unsigned int number = 0; - const float iScalar = 1.0 / scalar; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h deleted file mode 100644 index eae1185ec..000000000 --- a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H -#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - lv_16sc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz))); - - a += 4; - b += 4; - c += 4; - } - - number = quarterPoints * 4; - int16_t* c16Ptr = (int16_t*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - int16_t* c16Ptr = (int16_t*)cVector; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number =0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *c16Ptr++ = (int16_t)lv_creal(temp); - *c16Ptr++ = (int16_t)lv_cimag(temp); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */ diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h deleted file mode 100644 index 621276b08..000000000 --- a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H -#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H - -#include -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - __m128i x, y, realz, imagz; - __m128 ret; - lv_32fc_t* c = cVector; - const lv_8sc_t* a = aVector; - const lv_8sc_t* b = bVector; - __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1); - const int shuffleMask = _MM_SHUFFLE(2,3,0,1); - __m128 invScalar = _mm_set_ps1(1.0/scalar); - - for(;number < quarterPoints; number++){ - // Convert into 8 bit values into 16 bit values - x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a)); - y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b)); - - // Calculate the ar*cr - ai*(-ci) portions - realz = _mm_madd_epi16(x,y); - - // Calculate the complex conjugate of the cr + ci j values - y = _mm_sign_epi16(y, conjugateSign); - - // Shift the order of the cr and ci values - y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask); - - // Calculate the ar*(-ci) + cr*(ai) - imagz = _mm_madd_epi16(x,y); - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - // Interleave real and imaginary and then convert to float values - ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz)); - - // Normalize the floating point values - ret = _mm_mul_ps(ret, invScalar); - - // Store the floating point values - _mm_store_ps((float*)c, ret); - c += 2; - - a += 4; - b += 4; - } - - number = quarterPoints * 4; - float* cFloatPtr = (float*)&cVector[number]; - int8_t* a8Ptr = (int8_t*)&aVector[number]; - int8_t* b8Ptr = (int8_t*)&bVector[number]; - for(; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cFloatPtr++ = lv_creal(temp) / scalar; - *cFloatPtr++ = lv_cimag(temp) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector - \param cVector The complex vector where the results will be stored - \param aVector One of the complex vectors to be multiplied - \param bVector The complex vector which will be converted to complex conjugate and multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - float* cPtr = (float*)cVector; - const float invScalar = 1.0 / scalar; - int8_t* a8Ptr = (int8_t*)aVector; - int8_t* b8Ptr = (int8_t*)bVector; - for(number = 0; number < num_points; number++){ - float aReal = (float)*a8Ptr++; - float aImag = (float)*a8Ptr++; - lv_32fc_t aVal = lv_32fc_init(aReal, aImag ); - float bReal = (float)*b8Ptr++; - float bImag = (float)*b8Ptr++; - lv_32fc_t bVal = lv_32fc_init( bReal, -bImag ); - lv_32fc_t temp = aVal * bVal; - - *cPtr++ = (lv_creal(temp) * invScalar); - *cPtr++ = (lv_cimag(temp) * invScalar); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h deleted file mode 100644 index 6a35e969d..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H -#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - __m128i complexVal, iOutputVal, qOutputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iOutputVal = _mm_cvtepi8_epi16(iOutputVal); - iOutputVal = _mm_slli_epi16(iOutputVal, 8); - - qOutputVal = _mm_cvtepi8_epi16(qOutputVal); - qOutputVal = _mm_slli_epi16(qOutputVal, 8); - - _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); - _mm_store_si128((__m128i*)qBufferPtr, qOutputVal); - - iBufferPtr += 8; - qBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - int16_t* qBufferPtr = qBuffer; - unsigned int number; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h deleted file mode 100644 index 67ffebd99..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H -#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i complexVal, outputVal; - - unsigned int eighthPoints = num_points / 8; - - for(number = 0; number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - outputVal = _mm_cvtepi8_epi16(complexVal); - outputVal = _mm_slli_epi16(outputVal, 7); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - int16_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */ diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h deleted file mode 100644 index ecffc092e..000000000 --- a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H -#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H - -#include -#include - -#if LV_HAVE_SSSE3 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - __m128i complexVal1, complexVal2, outputVal; - - unsigned int sixteenthPoints = num_points / 16; - - for(number = 0; number < sixteenthPoints; number++){ - complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - - complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1); - complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2); - - outputVal = _mm_or_si128(complexVal1, complexVal2); - - _mm_store_si128((__m128i*)iBufferPtr, outputVal); - iBufferPtr += 16; - } - - number = sixteenthPoints * 16; - for(; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSSE3 */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (int8_t*)complexVector; - int8_t* iBufferPtr = iBuffer; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = *complexVectorPtr++; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h deleted file mode 100644 index cedbf202c..000000000 --- a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H -#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue, qFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask); - qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - iComplexVal = _mm_srli_si128(iComplexVal, 4); - - iIntVal = _mm_cvtepi8_epi32(iComplexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - _mm_store_ps(iBufferPtr, iFloatValue); - iBufferPtr += 4; - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - qBufferPtr += 4; - - qComplexVal = _mm_srli_si128(qComplexVal, 4); - - qIntVal = _mm_cvtepi8_epi32(qComplexVal); - qFloatValue = _mm_cvtepi32_ps(qIntVal); - qFloatValue = _mm_mul_ps(qFloatValue, invScalar); - _mm_store_ps(qBufferPtr, qFloatValue); - - qBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 cplxValue1, cplxValue2, iValue, qValue; - - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[8] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(complexVectorPtr[0]); - floatBuffer[1] = (float)(complexVectorPtr[1]); - floatBuffer[2] = (float)(complexVectorPtr[2]); - floatBuffer[3] = (float)(complexVectorPtr[3]); - - floatBuffer[4] = (float)(complexVectorPtr[4]); - floatBuffer[5] = (float)(complexVectorPtr[5]); - floatBuffer[6] = (float)(complexVectorPtr[6]); - floatBuffer[7] = (float)(complexVectorPtr[7]); - - cplxValue1 = _mm_load_ps(&floatBuffer[0]); - cplxValue2 = _mm_load_ps(&floatBuffer[4]); - - complexVectorPtr += 8; - - cplxValue1 = _mm_mul_ps(cplxValue1, invScalar); - cplxValue2 = _mm_mul_ps(cplxValue2, invScalar); - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - _mm_store_ps(iBufferPtr, iValue); - _mm_store_ps(qBufferPtr, qValue); - - iBufferPtr += 4; - qBufferPtr += 4; - } - - number = quarterPoints * 4; - complexVectorPtr = (int8_t*)&complexVector[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param qBuffer The Q buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - float* qBufferPtr = qBuffer; - unsigned int number; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */ diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h deleted file mode 100644 index 902795131..000000000 --- a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H -#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H - -#include -#include - -#if LV_HAVE_SSE4_1 -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - __m128 iFloatValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - __m128i complexVal, iIntVal; - int8_t* complexVectorPtr = (int8_t*)complexVector; - - __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); - - for(;number < eighthPoints; number++){ - complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16; - complexVal = _mm_shuffle_epi8(complexVal, moveMask); - - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - - complexVal = _mm_srli_si128(complexVal, 4); - iIntVal = _mm_cvtepi8_epi32(complexVal); - iFloatValue = _mm_cvtepi32_ps(iIntVal); - - iFloatValue = _mm_mul_ps(iFloatValue, invScalar); - - _mm_store_ps(iBufferPtr, iFloatValue); - - iBufferPtr += 4; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE4_1 */ - - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - float* iBufferPtr = iBuffer; - - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - __m128 iValue; - - const float iScalar= 1.0 / scalar; - __m128 invScalar = _mm_set_ps1(iScalar); - int8_t* complexVectorPtr = (int8_t*)complexVector; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2; - floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; - - iValue = _mm_load_ps(floatBuffer); - - iValue = _mm_mul_ps(iValue, invScalar); - - _mm_store_ps(iBufferPtr, iValue); - - iBufferPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar; - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex 8 bit vector into I float vector data - \param complexVector The complex input vector - \param iBuffer The I buffer output data - \param scalar The scaling value being multiplied against each data point - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const int8_t* complexVectorPtr = (const int8_t*)complexVector; - float* iBufferPtr = iBuffer; - const float invScalar = 1.0 / scalar; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar; - complexVectorPtr++; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */ diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py index fc1ec10ef..bc8f959af 100755 --- a/volk/include/volk/volk_register.py +++ b/volk/include/volk/volk_register.py @@ -55,7 +55,7 @@ functions = []; for line in mfile: - subline = re.search(".*(a16).*", line); + subline = re.search(".*_(a16|u)\.h.*", line); if subline: subsubline = re.search("(?<=volk_).*", subline.group(0)); if subsubline: @@ -70,7 +70,7 @@ datatypes = set(datatypes); for line in mfile: for dt in datatypes: if dt in line: - subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line); + subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line); if subline: subsubline = re.search(".+(?=\.h)", subline.group(0)); diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index e73b70985..4c151bd6f 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -19,7 +19,8 @@ float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) } -void random_floats (float *buf, unsigned n) +template +void random_floats (t *buf, unsigned n) { for (unsigned i = 0; i < n; i++) buf[i] = uniform (); @@ -28,8 +29,8 @@ void random_floats (float *buf, unsigned n) void load_random_data(void *data, volk_type_t type, unsigned int n) { if(type.is_complex) n *= 2; if(type.is_float) { - assert(type.size == 4); //TODO: double support - random_floats((float *)data, n); + if(type.size == 8) random_floats((double *)data, n); + else random_floats((float *)data, n); } else { float int_max = pow(2, type.size*8); if(type.is_signed) int_max /= 2.0; @@ -54,7 +55,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; break; default: - throw; //no shenanigans here + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here } } } @@ -94,6 +95,9 @@ static std::vector get_arch_list(const int archs[]) { case (1< 1); + if(name.size() < 2) throw std::string("name too short to be a datatype"); //is it a scalar? if(name[0] == 's') { @@ -138,7 +142,7 @@ volk_type_t volk_type_from_string(std::string name) { //get the data size int last_size_pos = name.find_last_of("0123456789"); - if(last_size_pos < 0) throw 0; + if(last_size_pos < 0) throw std::string("no size spec in type ").append(name); //will throw if malformed int size = boost::lexical_cast(name.substr(0, last_size_pos+1)); @@ -182,12 +186,14 @@ static void get_signatures_from_name(std::vector &inputsig, //ok. we're assuming a string in the form //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) - enum { SIDE_INPUT, SIDE_OUTPUT } side = SIDE_INPUT; + enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; std::string fn_name; volk_type_t type; BOOST_FOREACH(std::string token, toked) { try { type = volk_type_from_string(token); + if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... + if(side == SIDE_INPUT) inputsig.push_back(type); else outputsig.push_back(type); } catch (...){ @@ -201,9 +207,11 @@ static void get_signatures_from_name(std::vector &inputsig, } } else if(side == SIDE_INPUT) { //it's the function name, at least it better be - side = SIDE_OUTPUT; - fn_name = token; - } else { + side = SIDE_NAME; + fn_name.append("_"); + fn_name.append(token); + } + else if(side == SIDE_OUTPUT) { if(token != toked.back()) throw; //the last token in the name is the alignment } } @@ -236,20 +244,40 @@ inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vect while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str()); } +inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, void *outbuff, std::vector &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(outbuff, inbuffs[0], inbuffs[1], scalar, vlen, arch.c_str()); +} + template bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; for(int i=0; i tol) return 1; + if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; + } + } } - return 0; + + return fail; } template -bool icompare(t *in1, t *in2, unsigned int vlen) { +bool icompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; for(int i=0; i 0) { + std::cout << "offset " << i << " in1: " << int(((t *)(in1))[i]) << " in2: " << int(((t *)(in2))[i]) << std::endl; + } + } } - return 0; + + return fail; } bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) { @@ -300,7 +328,7 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, load_random_data(inbuffs[i], inputsig[i], vlen); } } - + //now run the test clock_t start, end; for(int i = 0; i < arch_list.size(); i++) { @@ -311,18 +339,22 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, if(inputsc.size() == 0) { run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 1000.0, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 255.0, vlen, iter, arch_list[i]); } else throw "unsupported 1 arg function >1 scalars"; break; case 2: if(inputsc.size() == 0) { run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 1000.0, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]); } else throw "unsupported 2 arg function >1 scalars"; break; case 3: - run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + if(inputsc.size() == 0) { + run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]); + } else throw "unsupported 3 arg function >1 scalars"; break; case 4: run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); @@ -337,29 +369,24 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, } //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... - int generic_offset; + int generic_offset=0; for(int i=0; i> 8)); int16ComplexVectorPtr++; } } @@ -68,10 +68,10 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con */ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){ unsigned int number = 0; - const int16_t* complexVectorPtr = (int16_t*)complexVector; + int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); complexVectorPtr++; } } diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h deleted file mode 100644 index 6042e6d62..000000000 --- a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H -#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - - __m128 vScalar = _mm_set_ps1(scalar); - - __m128 cplxValue1, cplxValue2, iValue; - - float floatBuffer[4] __attribute__((aligned(128))); - - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - - iValue = _mm_mul_ps(iValue, vScalar); - - _mm_store_ps(floatBuffer, iValue); - *iBufferPtr++ = (int16_t)(floatBuffer[0]); - *iBufferPtr++ = (int16_t)(floatBuffer[1]); - *iBufferPtr++ = (int16_t)(floatBuffer[2]); - *iBufferPtr++ = (int16_t)(floatBuffer[3]); - } - - number = quarterPoints * 4; - iBufferPtr = &iBuffer[number]; - for(; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data - \param complexVector The complex input vector - \param scalar The value to be multiply against each of the input values - \param iBuffer The I buffer output data - \param num_points The number of complex data values to be deinterleaved -*/ -static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - int16_t* iBufferPtr = iBuffer; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); - complexVectorPtr++; - } - -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h new file mode 100644 index 000000000..31465bff9 --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_deinterleave_real_16i_a16.h @@ -0,0 +1,80 @@ +#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H +#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + + __m128 vScalar = _mm_set_ps1(scalar); + + __m128 cplxValue1, cplxValue2, iValue; + + float floatBuffer[4] __attribute__((aligned(128))); + + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + + iValue = _mm_mul_ps(iValue, vScalar); + + _mm_store_ps(floatBuffer, iValue); + *iBufferPtr++ = (int16_t)(floatBuffer[0]); + *iBufferPtr++ = (int16_t)(floatBuffer[1]); + *iBufferPtr++ = (int16_t)(floatBuffer[2]); + *iBufferPtr++ = (int16_t)(floatBuffer[3]); + } + + number = quarterPoints * 4; + iBufferPtr = &iBuffer[number]; + for(; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data + \param complexVector The complex input vector + \param scalar The value to be multiply against each of the input values + \param iBuffer The I buffer output data + \param num_points The number of complex data values to be deinterleaved +*/ +static inline void volk_32fc_s32f_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + int16_t* iBufferPtr = iBuffer; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); + complexVectorPtr++; + } + +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a16_H */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 4c151bd6f..8f57a9b90 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -253,6 +253,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { bool fail = false; int print_max_errs = 10; for(int i=0; i tol) { fail=true; if(print_max_errs-- > 0) { @@ -265,14 +266,14 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { } template -bool icompare(t *in1, t *in2, unsigned int vlen, float tol) { +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { bool fail = false; int print_max_errs = 10; for(int i=0; i tol) { fail=true; if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << int(((t *)(in1))[i]) << " in2: " << int(((t *)(in2))[i]) << std::endl; + std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])) << std::endl; } } } @@ -339,21 +340,21 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, if(inputsc.size() == 0) { run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 255.0, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 1 arg function >1 scalars"; break; case 2: if(inputsc.size() == 0) { run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 2 arg function >1 scalars"; break; case 3: if(inputsc.size() == 0) { run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]); + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 3 arg function >1 scalars"; break; case 4: @@ -375,7 +376,7 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, //now compare if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know - + //TODO: loop over the output signature as well bool fail = false; for(int i=0; i -#include - -#if LV_HAVE_SSE -#include -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* dataPointsPtr = realDataPoints; - float avgPointsVector[4] __attribute__((aligned(128))); - - __m128 dataPointsVal; - __m128 avgPointsVal = _mm_setzero_ps(); - // Calculate the sum (for mean) for all points - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); - } - - _mm_store_ps(avgPointsVector, avgPointsVal); - - float sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more - const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; - - dataPointsPtr = realDataPoints; // Reset the dataPointsPtr - __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); - __m128 vOnesVector = _mm_set_ps1(1.0); - __m128 vValidBinCount = _mm_setzero_ps(); - avgPointsVal = _mm_setzero_ps(); - __m128 compareMask; - number = 0; - // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude - for(; number < quarterPoints; number++){ - - dataPointsVal = _mm_load_ps(dataPointsPtr); - - dataPointsPtr += 4; - - // Identify which items do not exceed the mean amplitude - compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); - - // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude - avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); - - // Count the number of bins which do not exceed the mean amplitude - vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); - } - - // Calculate the mean from the remaining data points - _mm_store_ps(avgPointsVector, avgPointsVal); - - sumMean = 0.0; - sumMean += avgPointsVector[0]; - sumMean += avgPointsVector[1]; - sumMean += avgPointsVector[2]; - sumMean += avgPointsVector[3]; - - // Calculate the number of valid bins from the remaning count - float validBinCountVector[4] __attribute__((aligned(128))); - _mm_store_ps(validBinCountVector, vValidBinCount); - - float validBinCount = 0; - validBinCount += validBinCountVector[0]; - validBinCount += validBinCountVector[1]; - validBinCount += validBinCountVector[2]; - validBinCount += validBinCountVector[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(realDataPoints[number] <= meanAmplitude){ - sumMean += realDataPoints[number]; - validBinCount += 1.0; - } - } - - float localNoiseFloorAmplitude = 0; - if(validBinCount > 0.0){ - localNoiseFloorAmplitude = sumMean / validBinCount; - } - else{ - localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... - } - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC -/*! - \brief Calculates the spectral noise floor of an input power spectrum - - Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. - - \param realDataPoints The input power spectrum - \param num_points The number of data points in the input power spectrum vector - \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 - \param noiseFloorAmplitude The noise floor of the input spectrum, in dB -*/ -static inline void volk_32f_calc_spectral_noise_floor_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ - float sumMean = 0.0; - unsigned int number; - // find the sum (for mean), etc - for(number = 0; number < num_points; number++){ - // sum (for mean) - sumMean += realDataPoints[number]; - } - - // calculate the spectral mean - // +20 because for the comparison below we only want to throw out bins - // that are significantly higher (and would, thus, affect the mean more) - const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; - - // now throw out any bins higher than the mean - sumMean = 0.0; - unsigned int newNumDataPoints = num_points; - for(number = 0; number < num_points; number++){ - if (realDataPoints[number] <= meanAmplitude) - sumMean += realDataPoints[number]; - else - newNumDataPoints--; - } - - float localNoiseFloorAmplitude = 0.0; - if (newNumDataPoints == 0) // in the odd case that all - localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! - else - localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); - - *noiseFloorAmplitude = localNoiseFloorAmplitude; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32f_calc_spectral_noise_floor_a16_H */ diff --git a/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h new file mode 100644 index 000000000..168245d65 --- /dev/null +++ b/volk/include/volk/volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h @@ -0,0 +1,167 @@ +#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H +#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* dataPointsPtr = realDataPoints; + float avgPointsVector[4] __attribute__((aligned(128))); + + __m128 dataPointsVal; + __m128 avgPointsVal = _mm_setzero_ps(); + // Calculate the sum (for mean) for all points + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal); + } + + _mm_store_ps(avgPointsVector, avgPointsVal); + + float sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more + const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue; + + dataPointsPtr = realDataPoints; // Reset the dataPointsPtr + __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude); + __m128 vOnesVector = _mm_set_ps1(1.0); + __m128 vValidBinCount = _mm_setzero_ps(); + avgPointsVal = _mm_setzero_ps(); + __m128 compareMask; + number = 0; + // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude + for(; number < quarterPoints; number++){ + + dataPointsVal = _mm_load_ps(dataPointsPtr); + + dataPointsPtr += 4; + + // Identify which items do not exceed the mean amplitude + compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector); + + // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude + avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal)); + + // Count the number of bins which do not exceed the mean amplitude + vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector)); + } + + // Calculate the mean from the remaining data points + _mm_store_ps(avgPointsVector, avgPointsVal); + + sumMean = 0.0; + sumMean += avgPointsVector[0]; + sumMean += avgPointsVector[1]; + sumMean += avgPointsVector[2]; + sumMean += avgPointsVector[3]; + + // Calculate the number of valid bins from the remaning count + float validBinCountVector[4] __attribute__((aligned(128))); + _mm_store_ps(validBinCountVector, vValidBinCount); + + float validBinCount = 0; + validBinCount += validBinCountVector[0]; + validBinCount += validBinCountVector[1]; + validBinCount += validBinCountVector[2]; + validBinCount += validBinCountVector[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(realDataPoints[number] <= meanAmplitude){ + sumMean += realDataPoints[number]; + validBinCount += 1.0; + } + } + + float localNoiseFloorAmplitude = 0; + if(validBinCount > 0.0){ + localNoiseFloorAmplitude = sumMean / validBinCount; + } + else{ + localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal... + } + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC +/*! + \brief Calculates the spectral noise floor of an input power spectrum + + Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB). Provides a rough estimation of the signal noise floor. + + \param realDataPoints The input power spectrum + \param num_points The number of data points in the input power spectrum vector + \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20 + \param noiseFloorAmplitude The noise floor of the input spectrum, in dB +*/ +static inline void volk_32f_s32f_calc_spectral_noise_floor_32f_a16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){ + float sumMean = 0.0; + unsigned int number; + // find the sum (for mean), etc + for(number = 0; number < num_points; number++){ + // sum (for mean) + sumMean += realDataPoints[number]; + } + + // calculate the spectral mean + // +20 because for the comparison below we only want to throw out bins + // that are significantly higher (and would, thus, affect the mean more) + const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue; + + // now throw out any bins higher than the mean + sumMean = 0.0; + unsigned int newNumDataPoints = num_points; + for(number = 0; number < num_points; number++){ + if (realDataPoints[number] <= meanAmplitude) + sumMean += realDataPoints[number]; + else + newNumDataPoints--; + } + + float localNoiseFloorAmplitude = 0.0; + if (newNumDataPoints == 0) // in the odd case that all + localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal! + else + localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints); + + *noiseFloorAmplitude = localNoiseFloorAmplitude; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a16_H */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 8f57a9b90..b1c55fc05 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -66,6 +66,7 @@ void *make_aligned_buffer(unsigned int len, unsigned int size) { int ret; ret = posix_memalign((void**)&buf, 16, len * size); assert(ret == 0); + memset(buf, 0x00, len*size); return buf; } @@ -220,32 +221,32 @@ static void get_signatures_from_name(std::vector &inputsig, assert(inputsig.size() != 0); } -inline void run_cast_test1(volk_fn_1arg func, void *buff, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buff, vlen, arch.c_str()); +inline void run_cast_test1(volk_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], vlen, arch.c_str()); } -inline void run_cast_test2(volk_fn_2arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(outbuff, inbuffs[0], vlen, arch.c_str()); +inline void run_cast_test2(volk_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); } -inline void run_cast_test3(volk_fn_3arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(outbuff, inbuffs[0], inbuffs[1], vlen, arch.c_str()); +inline void run_cast_test3(volk_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); } -inline void run_cast_test4(volk_fn_4arg func, void *outbuff, std::vector &inbuffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(outbuff, inbuffs[0], inbuffs[1], inbuffs[2], vlen, arch.c_str()); +inline void run_cast_test4(volk_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); } -inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, void *buff, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buff, scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32f(volk_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vector &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, void *outbuff, std::vector &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(outbuff, inbuffs[0], inbuffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } template @@ -253,7 +254,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { bool fail = false; int print_max_errs = 10; for(int i=0; i tol) { fail=true; if(print_max_errs-- > 0) { @@ -291,74 +292,70 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, std::vector inputsig, outputsig; get_signatures_from_name(inputsig, outputsig, name); - std::vector inputsc, outputsc; + //pull the input scalars into their own vector + std::vector inputsc; for(int i=0; i inbuffs, outbuffs; + std::vector inbuffs; + + make_buffer_for_signature(inbuffs, inputsig, vlen); + for(int i=0; i > test_data; + for(int i=0; i arch_buffs; + for(int j=0; j both_sigs; + both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); + both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); //now run the test clock_t start, end; for(int i = 0; i < arch_list.size(); i++) { start = clock(); - switch(inputsig.size() + outputsig.size()) { + switch(both_sigs.size()) { case 1: if(inputsc.size() == 0) { - run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); + run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 127.0, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 1 arg function >1 scalars"; break; case 2: if(inputsc.size() == 0) { - run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 2 arg function >1 scalars"; break; case 3: if(inputsc.size() == 0) { - run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 127.0, vlen, iter, arch_list[i]); + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); } else throw "unsupported 3 arg function >1 scalars"; break; case 4: - run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]); + run_cast_test4((volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); break; default: throw "no function handler for this signature"; @@ -375,61 +372,63 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, if(arch_list[i] == "generic") generic_offset=i; //now compare - if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know - //TODO: loop over the output signature as well + //if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know + bool fail = false; + bool fail_global = false; for(int i=0; i> 8)); + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); int16ComplexVectorPtr++; } } @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256)); complexVectorPtr++; } } diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index b1c55fc05..67ce5ddef 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -282,7 +282,7 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { return fail; } -bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) { +bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) { std::cout << "RUN_VOLK_TESTS: " << name << std::endl; //first let's get a list of available architectures for the test @@ -337,21 +337,21 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, if(inputsc.size() == 0) { run_cast_test1((volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else throw "unsupported 1 arg function >1 scalars"; break; case 2: if(inputsc.size() == 0) { run_cast_test2((volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else throw "unsupported 2 arg function >1 scalars"; break; case 3: if(inputsc.size() == 0) { run_cast_test3((volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } else if(inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], 127.0, vlen, iter, arch_list[i]); + run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else throw "unsupported 3 arg function >1 scalars"; break; case 4: diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h index 79fc8f006..e2539060a 100644 --- a/volk/lib/qa_utils.h +++ b/volk/lib/qa_utils.h @@ -18,9 +18,9 @@ volk_type_t volk_type_from_string(std::string); float uniform(void); void random_floats(float *buf, unsigned n); -bool run_volk_tests(const int[], void(*)(), std::string, float, int, int); +bool run_volk_tests(const int[], void(*)(), std::string, float, float, int, int); -#define VOLK_RUN_TESTS(func, tol, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, len, iter), 0) +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_CHECK_EQUAL(run_volk_tests(func##_arch_defs, (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter), 0) typedef void (*volk_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place typedef void (*volk_fn_2arg)(void *, void *, unsigned int, const char*); diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 4dd7f7599..9f4934dc0 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -7,93 +7,93 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { //in order... // VOLK_RUN_TESTS(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000); // VOLK_RUN_TESTS(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 2046, 10000); - VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 1e-4, 2046, 10000); -// VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 2046, 10000); -// VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_16u_byteswap_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 2046, 1000); - VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_deinterleave_32f_x2_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_deinterleave_real_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_magnitude_16i_a16, 1, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16ic_s32f_magnitude_32f_a16, 1e-5, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_a16, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_s32f_convert_32f_u, 1e-4, 32768.0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_convert_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_max_star_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16i_max_star_horizontal_16i_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a16, 1e-4, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_16u_byteswap_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 0, 2046, 1000); + VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_u, 1, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_a16, 1, 2<<31, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 128, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 128, 2046, 10000); // VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16, 1e-4, 10, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000); // VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_32u_byteswap_a16, 1e-4, 2046, 10000); -// VOLK_RUN_TESTS(volk_32u_popcnt_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_64u_byteswap_a16, 1e-4, 2046, 10000); -// VOLK_RUN_TESTS(volk_64u_popcnt_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8i_convert_16i_u, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 2046, 10000); - VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_multiply_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_normalize_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_power_32f_a16, 1e-4, 4, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_sqrt_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_stddev_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_stddev_and_mean_32f_x2_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_subtract_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x3_sum_of_poly_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_x2_and_32i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_32i_x2_or_32i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32u_byteswap_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_32u_popcnt_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_convert_32f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_convert_32f_u, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_x2_max_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64f_x2_min_64f_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_64u_byteswap_a16, 0, 0, 2046, 10000); +// VOLK_RUN_TESTS(volk_64u_popcnt_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_16i_x2_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_32f_x2_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_real_16i_a16, 0, 256, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_s32f_deinterleave_real_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_deinterleave_real_8i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_x2_multiply_conjugate_16ic_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8ic_x2_s32f_multiply_conjugate_32fc_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_convert_16i_a16, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_convert_16i_u, 0, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_a16, 1e-4, 100, 2046, 10000); + VOLK_RUN_TESTS(volk_8i_s32f_convert_32f_u, 1e-4, 100, 2046, 10000); } -- cgit From 6091bad60cdfdf21624da452c7a8b74405345070 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 15:41:30 -0800 Subject: Volk: removed all the old QA code that is covered by the test framework. --- volk/configure.ac | 1 - volk/lib/Makefile.am | 1 - volk/lib/qa_16s_convert_32f_aligned16.cc | 74 ------- volk/lib/qa_16s_convert_32f_aligned16.h | 18 -- volk/lib/qa_16s_convert_32f_unaligned16.cc | 74 ------- volk/lib/qa_16s_convert_32f_unaligned16.h | 18 -- volk/lib/qa_16s_convert_8s_aligned16.cc | 61 ------ volk/lib/qa_16s_convert_8s_aligned16.h | 18 -- volk/lib/qa_16s_convert_8s_unaligned16.cc | 61 ------ volk/lib/qa_16s_convert_8s_unaligned16.h | 18 -- volk/lib/qa_16s_max_star_aligned16.cc | 65 ------- volk/lib/qa_16s_max_star_aligned16.h | 18 -- volk/lib/qa_16s_max_star_horizontal_aligned16.cc | 79 -------- volk/lib/qa_16s_max_star_horizontal_aligned16.h | 18 -- volk/lib/qa_16sc_deinterleave_16s_aligned16.cc | 89 --------- volk/lib/qa_16sc_deinterleave_16s_aligned16.h | 18 -- volk/lib/qa_16sc_deinterleave_32f_aligned16.cc | 75 -------- volk/lib/qa_16sc_deinterleave_32f_aligned16.h | 18 -- .../lib/qa_16sc_deinterleave_real_16s_aligned16.cc | 72 ------- volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h | 18 -- .../lib/qa_16sc_deinterleave_real_32f_aligned16.cc | 124 ------------ volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h | 18 -- volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc | 70 ------- volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h | 18 -- volk/lib/qa_16sc_magnitude_16s_aligned16.cc | 81 -------- volk/lib/qa_16sc_magnitude_16s_aligned16.h | 18 -- volk/lib/qa_16sc_magnitude_32f_aligned16.cc | 131 ------------- volk/lib/qa_16sc_magnitude_32f_aligned16.h | 18 -- volk/lib/qa_16u_byteswap_aligned16.cc | 71 ------- volk/lib/qa_16u_byteswap_aligned16.h | 18 -- volk/lib/qa_32f_accumulator_aligned16.cc | 57 ------ volk/lib/qa_32f_accumulator_aligned16.h | 18 -- volk/lib/qa_32f_add_aligned16.cc | 123 ------------ volk/lib/qa_32f_add_aligned16.h | 18 -- .../qa_32f_calc_spectral_noise_floor_aligned16.cc | 60 ------ .../qa_32f_calc_spectral_noise_floor_aligned16.h | 18 -- volk/lib/qa_32f_convert_16s_aligned16.cc | 71 ------- volk/lib/qa_32f_convert_16s_aligned16.h | 18 -- volk/lib/qa_32f_convert_16s_unaligned16.cc | 71 ------- volk/lib/qa_32f_convert_16s_unaligned16.h | 18 -- volk/lib/qa_32f_convert_32s_aligned16.cc | 71 ------- volk/lib/qa_32f_convert_32s_aligned16.h | 18 -- volk/lib/qa_32f_convert_32s_unaligned16.cc | 71 ------- volk/lib/qa_32f_convert_32s_unaligned16.h | 18 -- volk/lib/qa_32f_convert_64f_aligned16.cc | 61 ------ volk/lib/qa_32f_convert_64f_aligned16.h | 18 -- volk/lib/qa_32f_convert_64f_unaligned16.cc | 61 ------ volk/lib/qa_32f_convert_64f_unaligned16.h | 18 -- volk/lib/qa_32f_convert_8s_aligned16.cc | 71 ------- volk/lib/qa_32f_convert_8s_aligned16.h | 18 -- volk/lib/qa_32f_convert_8s_unaligned16.cc | 71 ------- volk/lib/qa_32f_convert_8s_unaligned16.h | 18 -- volk/lib/qa_32f_divide_aligned16.cc | 133 ------------- volk/lib/qa_32f_divide_aligned16.h | 18 -- volk/lib/qa_32f_dot_prod_aligned16.cc | 183 ------------------ volk/lib/qa_32f_dot_prod_aligned16.h | 18 -- volk/lib/qa_32f_dot_prod_unaligned16.cc | 190 ------------------ volk/lib/qa_32f_dot_prod_unaligned16.h | 18 -- volk/lib/qa_32f_interleave_16sc_aligned16.cc | 76 -------- volk/lib/qa_32f_interleave_16sc_aligned16.h | 18 -- volk/lib/qa_32f_interleave_32fc_aligned16.cc | 63 ------ volk/lib/qa_32f_interleave_32fc_aligned16.h | 18 -- volk/lib/qa_32f_max_aligned16.cc | 70 ------- volk/lib/qa_32f_max_aligned16.h | 18 -- volk/lib/qa_32f_min_aligned16.cc | 70 ------- volk/lib/qa_32f_min_aligned16.h | 18 -- volk/lib/qa_32f_multiply_aligned16.cc | 123 ------------ volk/lib/qa_32f_multiply_aligned16.h | 18 -- volk/lib/qa_32f_normalize_aligned16.cc | 79 -------- volk/lib/qa_32f_normalize_aligned16.h | 18 -- volk/lib/qa_32f_power_aligned16.cc | 95 --------- volk/lib/qa_32f_power_aligned16.h | 18 -- volk/lib/qa_32f_sqrt_aligned16.cc | 128 ------------ volk/lib/qa_32f_sqrt_aligned16.h | 18 -- volk/lib/qa_32f_stddev_aligned16.cc | 75 -------- volk/lib/qa_32f_stddev_aligned16.h | 18 -- volk/lib/qa_32f_stddev_and_mean_aligned16.cc | 76 -------- volk/lib/qa_32f_stddev_and_mean_aligned16.h | 18 -- volk/lib/qa_32f_subtract_aligned16.cc | 70 ------- volk/lib/qa_32f_subtract_aligned16.h | 18 -- volk/lib/qa_32f_sum_of_poly_aligned16.cc | 142 -------------- volk/lib/qa_32f_sum_of_poly_aligned16.h | 18 -- volk/lib/qa_32fc_32f_multiply_aligned16.cc | 75 -------- volk/lib/qa_32fc_32f_multiply_aligned16.h | 18 -- volk/lib/qa_32fc_32f_power_32fc_aligned16.cc | 83 -------- volk/lib/qa_32fc_32f_power_32fc_aligned16.h | 18 -- volk/lib/qa_32fc_atan2_32f_aligned16.cc | 76 -------- volk/lib/qa_32fc_atan2_32f_aligned16.h | 18 -- volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc | 138 ------------- volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h | 18 -- volk/lib/qa_32fc_deinterleave_32f_aligned16.cc | 64 ------ volk/lib/qa_32fc_deinterleave_32f_aligned16.h | 18 -- volk/lib/qa_32fc_deinterleave_64f_aligned16.cc | 64 ------ volk/lib/qa_32fc_deinterleave_64f_aligned16.h | 18 -- .../lib/qa_32fc_deinterleave_real_16s_aligned16.cc | 61 ------ volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h | 18 -- .../lib/qa_32fc_deinterleave_real_32f_aligned16.cc | 61 ------ volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h | 18 -- .../lib/qa_32fc_deinterleave_real_64f_aligned16.cc | 61 ------ volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h | 18 -- volk/lib/qa_32fc_dot_prod_aligned16.cc | 214 --------------------- volk/lib/qa_32fc_dot_prod_aligned16.h | 20 -- volk/lib/qa_32fc_magnitude_16s_aligned16.cc | 80 -------- volk/lib/qa_32fc_magnitude_16s_aligned16.h | 18 -- volk/lib/qa_32fc_magnitude_32f_aligned16.cc | 80 -------- volk/lib/qa_32fc_magnitude_32f_aligned16.h | 18 -- volk/lib/qa_32fc_multiply_aligned16.cc | 98 ---------- volk/lib/qa_32fc_multiply_aligned16.h | 18 -- volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc | 64 ------ volk/lib/qa_32fc_power_spectrum_32f_aligned16.h | 18 -- volk/lib/qa_32fc_square_dist_aligned16.cc | 91 --------- volk/lib/qa_32fc_square_dist_aligned16.h | 18 -- .../qa_32fc_square_dist_scalar_mult_aligned16.cc | 96 --------- .../qa_32fc_square_dist_scalar_mult_aligned16.h | 18 -- volk/lib/qa_32s_and_aligned16.cc | 70 ------- volk/lib/qa_32s_and_aligned16.h | 18 -- volk/lib/qa_32s_convert_32f_aligned16.cc | 61 ------ volk/lib/qa_32s_convert_32f_aligned16.h | 18 -- volk/lib/qa_32s_convert_32f_unaligned16.cc | 61 ------ volk/lib/qa_32s_convert_32f_unaligned16.h | 18 -- volk/lib/qa_32s_or_aligned16.cc | 70 ------- volk/lib/qa_32s_or_aligned16.h | 18 -- volk/lib/qa_32u_byteswap_aligned16.cc | 60 ------ volk/lib/qa_32u_byteswap_aligned16.h | 18 -- volk/lib/qa_64f_convert_32f_aligned16.cc | 61 ------ volk/lib/qa_64f_convert_32f_aligned16.h | 18 -- volk/lib/qa_64f_convert_32f_unaligned16.cc | 61 ------ volk/lib/qa_64f_convert_32f_unaligned16.h | 18 -- volk/lib/qa_64f_max_aligned16.cc | 61 ------ volk/lib/qa_64f_max_aligned16.h | 18 -- volk/lib/qa_64f_min_aligned16.cc | 61 ------ volk/lib/qa_64f_min_aligned16.h | 18 -- volk/lib/qa_64u_byteswap_aligned16.cc | 60 ------ volk/lib/qa_64u_byteswap_aligned16.h | 18 -- volk/lib/qa_8s_convert_16s_aligned16.cc | 64 ------ volk/lib/qa_8s_convert_16s_aligned16.h | 18 -- volk/lib/qa_8s_convert_16s_unaligned16.cc | 64 ------ volk/lib/qa_8s_convert_16s_unaligned16.h | 18 -- volk/lib/qa_8s_convert_32f_aligned16.cc | 72 ------- volk/lib/qa_8s_convert_32f_aligned16.h | 18 -- volk/lib/qa_8s_convert_32f_unaligned16.cc | 64 ------ volk/lib/qa_8s_convert_32f_unaligned16.h | 18 -- volk/lib/qa_8sc_deinterleave_16s_aligned16.cc | 68 ------- volk/lib/qa_8sc_deinterleave_16s_aligned16.h | 18 -- volk/lib/qa_8sc_deinterleave_32f_aligned16.cc | 135 ------------- volk/lib/qa_8sc_deinterleave_32f_aligned16.h | 18 -- volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc | 65 ------- volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h | 18 -- volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc | 139 ------------- volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h | 18 -- volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc | 61 ------ volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h | 18 -- .../qa_8sc_multiply_conjugate_16sc_aligned16.cc | 87 --------- .../lib/qa_8sc_multiply_conjugate_16sc_aligned16.h | 18 -- .../qa_8sc_multiply_conjugate_32fc_aligned16.cc | 87 --------- .../lib/qa_8sc_multiply_conjugate_32fc_aligned16.h | 18 -- volk/lib/qa_volk.cc | 211 -------------------- volk/lib/qa_volk.h | 36 ---- volk/lib/test_all.cc | 82 -------- 159 files changed, 8145 deletions(-) delete mode 100644 volk/lib/qa_16s_convert_32f_aligned16.cc delete mode 100644 volk/lib/qa_16s_convert_32f_aligned16.h delete mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.cc delete mode 100644 volk/lib/qa_16s_convert_32f_unaligned16.h delete mode 100644 volk/lib/qa_16s_convert_8s_aligned16.cc delete mode 100644 volk/lib/qa_16s_convert_8s_aligned16.h delete mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.cc delete mode 100644 volk/lib/qa_16s_convert_8s_unaligned16.h delete mode 100644 volk/lib/qa_16s_max_star_aligned16.cc delete mode 100644 volk/lib/qa_16s_max_star_aligned16.h delete mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.cc delete mode 100644 volk/lib/qa_16s_max_star_horizontal_aligned16.h delete mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.cc delete mode 100644 volk/lib/qa_16sc_deinterleave_16s_aligned16.h delete mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.cc delete mode 100644 volk/lib/qa_16sc_deinterleave_32f_aligned16.h delete mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc delete mode 100644 volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc delete mode 100644 volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h delete mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc delete mode 100644 volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h delete mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.cc delete mode 100644 volk/lib/qa_16sc_magnitude_16s_aligned16.h delete mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.cc delete mode 100644 volk/lib/qa_16sc_magnitude_32f_aligned16.h delete mode 100644 volk/lib/qa_16u_byteswap_aligned16.cc delete mode 100644 volk/lib/qa_16u_byteswap_aligned16.h delete mode 100644 volk/lib/qa_32f_accumulator_aligned16.cc delete mode 100644 volk/lib/qa_32f_accumulator_aligned16.h delete mode 100644 volk/lib/qa_32f_add_aligned16.cc delete mode 100644 volk/lib/qa_32f_add_aligned16.h delete mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc delete mode 100644 volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h delete mode 100644 volk/lib/qa_32f_convert_16s_aligned16.cc delete mode 100644 volk/lib/qa_32f_convert_16s_aligned16.h delete mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.cc delete mode 100644 volk/lib/qa_32f_convert_16s_unaligned16.h delete mode 100644 volk/lib/qa_32f_convert_32s_aligned16.cc delete mode 100644 volk/lib/qa_32f_convert_32s_aligned16.h delete mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.cc delete mode 100644 volk/lib/qa_32f_convert_32s_unaligned16.h delete mode 100644 volk/lib/qa_32f_convert_64f_aligned16.cc delete mode 100644 volk/lib/qa_32f_convert_64f_aligned16.h delete mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.cc delete mode 100644 volk/lib/qa_32f_convert_64f_unaligned16.h delete mode 100644 volk/lib/qa_32f_convert_8s_aligned16.cc delete mode 100644 volk/lib/qa_32f_convert_8s_aligned16.h delete mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.cc delete mode 100644 volk/lib/qa_32f_convert_8s_unaligned16.h delete mode 100644 volk/lib/qa_32f_divide_aligned16.cc delete mode 100644 volk/lib/qa_32f_divide_aligned16.h delete mode 100644 volk/lib/qa_32f_dot_prod_aligned16.cc delete mode 100644 volk/lib/qa_32f_dot_prod_aligned16.h delete mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.cc delete mode 100644 volk/lib/qa_32f_dot_prod_unaligned16.h delete mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.cc delete mode 100644 volk/lib/qa_32f_interleave_16sc_aligned16.h delete mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.cc delete mode 100644 volk/lib/qa_32f_interleave_32fc_aligned16.h delete mode 100644 volk/lib/qa_32f_max_aligned16.cc delete mode 100644 volk/lib/qa_32f_max_aligned16.h delete mode 100644 volk/lib/qa_32f_min_aligned16.cc delete mode 100644 volk/lib/qa_32f_min_aligned16.h delete mode 100644 volk/lib/qa_32f_multiply_aligned16.cc delete mode 100644 volk/lib/qa_32f_multiply_aligned16.h delete mode 100644 volk/lib/qa_32f_normalize_aligned16.cc delete mode 100644 volk/lib/qa_32f_normalize_aligned16.h delete mode 100644 volk/lib/qa_32f_power_aligned16.cc delete mode 100644 volk/lib/qa_32f_power_aligned16.h delete mode 100644 volk/lib/qa_32f_sqrt_aligned16.cc delete mode 100644 volk/lib/qa_32f_sqrt_aligned16.h delete mode 100644 volk/lib/qa_32f_stddev_aligned16.cc delete mode 100644 volk/lib/qa_32f_stddev_aligned16.h delete mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.cc delete mode 100644 volk/lib/qa_32f_stddev_and_mean_aligned16.h delete mode 100644 volk/lib/qa_32f_subtract_aligned16.cc delete mode 100644 volk/lib/qa_32f_subtract_aligned16.h delete mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.cc delete mode 100644 volk/lib/qa_32f_sum_of_poly_aligned16.h delete mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.cc delete mode 100644 volk/lib/qa_32fc_32f_multiply_aligned16.h delete mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.cc delete mode 100644 volk/lib/qa_32fc_32f_power_32fc_aligned16.h delete mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_atan2_32f_aligned16.h delete mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc delete mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h delete mode 100644 volk/lib/qa_32fc_deinterleave_32f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_deinterleave_32f_aligned16.h delete mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_deinterleave_64f_aligned16.h delete mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc delete mode 100644 volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h delete mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h delete mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.cc delete mode 100644 volk/lib/qa_32fc_dot_prod_aligned16.h delete mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.cc delete mode 100644 volk/lib/qa_32fc_magnitude_16s_aligned16.h delete mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_magnitude_32f_aligned16.h delete mode 100644 volk/lib/qa_32fc_multiply_aligned16.cc delete mode 100644 volk/lib/qa_32fc_multiply_aligned16.h delete mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc delete mode 100644 volk/lib/qa_32fc_power_spectrum_32f_aligned16.h delete mode 100644 volk/lib/qa_32fc_square_dist_aligned16.cc delete mode 100644 volk/lib/qa_32fc_square_dist_aligned16.h delete mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc delete mode 100644 volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h delete mode 100644 volk/lib/qa_32s_and_aligned16.cc delete mode 100644 volk/lib/qa_32s_and_aligned16.h delete mode 100644 volk/lib/qa_32s_convert_32f_aligned16.cc delete mode 100644 volk/lib/qa_32s_convert_32f_aligned16.h delete mode 100644 volk/lib/qa_32s_convert_32f_unaligned16.cc delete mode 100644 volk/lib/qa_32s_convert_32f_unaligned16.h delete mode 100644 volk/lib/qa_32s_or_aligned16.cc delete mode 100644 volk/lib/qa_32s_or_aligned16.h delete mode 100644 volk/lib/qa_32u_byteswap_aligned16.cc delete mode 100644 volk/lib/qa_32u_byteswap_aligned16.h delete mode 100644 volk/lib/qa_64f_convert_32f_aligned16.cc delete mode 100644 volk/lib/qa_64f_convert_32f_aligned16.h delete mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.cc delete mode 100644 volk/lib/qa_64f_convert_32f_unaligned16.h delete mode 100644 volk/lib/qa_64f_max_aligned16.cc delete mode 100644 volk/lib/qa_64f_max_aligned16.h delete mode 100644 volk/lib/qa_64f_min_aligned16.cc delete mode 100644 volk/lib/qa_64f_min_aligned16.h delete mode 100644 volk/lib/qa_64u_byteswap_aligned16.cc delete mode 100644 volk/lib/qa_64u_byteswap_aligned16.h delete mode 100644 volk/lib/qa_8s_convert_16s_aligned16.cc delete mode 100644 volk/lib/qa_8s_convert_16s_aligned16.h delete mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.cc delete mode 100644 volk/lib/qa_8s_convert_16s_unaligned16.h delete mode 100644 volk/lib/qa_8s_convert_32f_aligned16.cc delete mode 100644 volk/lib/qa_8s_convert_32f_aligned16.h delete mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.cc delete mode 100644 volk/lib/qa_8s_convert_32f_unaligned16.h delete mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.cc delete mode 100644 volk/lib/qa_8sc_deinterleave_16s_aligned16.h delete mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.cc delete mode 100644 volk/lib/qa_8sc_deinterleave_32f_aligned16.h delete mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc delete mode 100644 volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h delete mode 100644 volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc delete mode 100644 volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h delete mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc delete mode 100644 volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h delete mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc delete mode 100644 volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h delete mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc delete mode 100644 volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h delete mode 100644 volk/lib/qa_volk.cc delete mode 100644 volk/lib/qa_volk.h delete mode 100644 volk/lib/test_all.cc (limited to 'volk') diff --git a/volk/configure.ac b/volk/configure.ac index ebbebb7d2..44a4c4258 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -18,7 +18,6 @@ dnl AC_INIT AC_PREREQ(2.57) AC_CONFIG_AUX_DIR([.]) -AC_CONFIG_SRCDIR([lib/test_all.cc]) AM_CONFIG_HEADER(config.h) AM_INIT_AUTOMAKE(volk,0.1) diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 63df85244..bbc993fa2 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -110,7 +110,6 @@ endif # ---------------------------------------------------------------- noinst_HEADERS = \ volk_init.h \ - qa_volk.h \ qa_utils.h \ assembly.h diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc deleted file mode 100644 index 6215f4a64..000000000 --- a/volk/lib/qa_16s_convert_32f_aligned16.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE - -void qa_16s_convert_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_16s_convert_32f_aligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int16_t input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("16s_convert_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h deleted file mode 100644 index ef813d96f..000000000 --- a/volk/lib/qa_16s_convert_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H -#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H - -#include -#include - -class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc deleted file mode 100644 index 46c2e48ac..000000000 --- a/volk/lib/qa_16s_convert_32f_unaligned16.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE - -void qa_16s_convert_32f_unaligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_16s_convert_32f_unaligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int16_t input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("16s_convert_32f_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h deleted file mode 100644 index aeb04f770..000000000 --- a/volk/lib/qa_16s_convert_32f_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H -#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H - -#include -#include - -class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc deleted file mode 100644 index 8225aa0cf..000000000 --- a/volk/lib/qa_16s_convert_8s_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_16s_convert_8s_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_16s_convert_8s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int16_t input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("16s_convert_8s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h deleted file mode 100644 index 2e409d0cc..000000000 --- a/volk/lib/qa_16s_convert_8s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H -#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H - -#include -#include - -class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc deleted file mode 100644 index e6ce5030e..000000000 --- a/volk/lib/qa_16s_convert_8s_unaligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_16s_convert_8s_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_16s_convert_8s_unaligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int16_t input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("16s_convert_8s_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h deleted file mode 100644 index 4b2fe9e42..000000000 --- a/volk/lib/qa_16s_convert_8s_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H -#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H - -#include -#include - -class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc deleted file mode 100644 index c6f828ba6..000000000 --- a/volk/lib/qa_16s_max_star_aligned16.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include -#include -#include -//test for ssse3 - -#ifndef LV_HAVE_SSSE3 - -void qa_16s_max_star_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - - - -void qa_16s_max_star_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 6400; - const int ITERS = 100000; - short input0[vlen] __attribute__ ((aligned (16))); - short output0[1] __attribute__ ((aligned (16))); - - short output1[1] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; - - short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; - - input0[i] = plus0 - minus0; - - } - printf("16s_max_star_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < 1; ++i) { - - CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); - } -} - -#endif diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h deleted file mode 100644 index 119f87c4d..000000000 --- a/volk/lib/qa_16s_max_star_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H -#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H - -#include -#include - -class qa_16s_max_star_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */ diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc deleted file mode 100644 index 0a58570e2..000000000 --- a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include -#include -#include -#include -//test for ssse3 - -#ifndef LV_HAVE_SSSE3 - -void qa_16s_max_star_horizontal_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - - -void qa_16s_max_star_horizontal_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 32; - const int ITERS = 1; - short input0[vlen] __attribute__ ((aligned (16))); - short output0[vlen>>1] __attribute__ ((aligned (16))); - - short output1[vlen>>1] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - short plus0 = ((short) (rand() - (RAND_MAX/2))); - - short minus0 = ((short) (rand() - (RAND_MAX/2))); - - input0[i] = plus0 - minus0; - - } - printf("16s_max_star_horizontal_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic"); - volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic"); - volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - - get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen); - get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); - get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen); - /* volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3"); - volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3"); - volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/ - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - - for(int i = 0; i < (vlen >> 1); ++i) { - // printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - - } - for(int i = 0; i < (vlen >> 1); ++i) { - - CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); - } - } - - -#endif - diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h deleted file mode 100644 index 9f9757253..000000000 --- a/volk/lib/qa_16s_max_star_horizontal_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H -#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H - -#include -#include - -class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc deleted file mode 100644 index aadc39067..000000000 --- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSSE3 - -void qa_16sc_deinterleave_16s_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - -void qa_16sc_deinterleave_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_generic1[vlen] __attribute__ ((aligned (16))); - int16_t output_sse2[vlen] __attribute__ ((aligned (16))); - int16_t output_sse21[vlen] __attribute__ ((aligned (16))); - int16_t output_orc[vlen] __attribute__ ((aligned (16))); - int16_t output_orc1[vlen] __attribute__ ((aligned (16))); - int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); - int16_t output_ssse31[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); - } - printf("16sc_deinterleave_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_16s_aligned16_manual(output_orc, output_orc1, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse21[i]); - - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); - CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_ssse31[i]); - - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]); - CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_orc1[i]); - } -} - -#endif diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h deleted file mode 100644 index 995ab5b34..000000000 --- a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc deleted file mode 100644 index 13151be13..000000000 --- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_16sc_deinterleave_32f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_16sc_deinterleave_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_generic1[vlen] __attribute__ ((aligned (16))); - float output_sse2[vlen] __attribute__ ((aligned (16))); - float output_sse21[vlen] __attribute__ ((aligned (16))); - float output_orc[vlen] __attribute__ ((aligned (16))); - float output_orc1[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); - } - printf("16sc_deinterleave_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_32f_aligned16_manual(output_orc, output_orc1, input0, 32768.0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_orc1[i], fabs(output_generic1[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h deleted file mode 100644 index fea3b6c2d..000000000 --- a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc deleted file mode 100644 index c67064ea6..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSSE3 - -void qa_16sc_deinterleave_real_16s_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - -void qa_16sc_deinterleave_real_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse2[vlen] __attribute__ ((aligned (16))); - int16_t output_ssse3[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32678.0)); - } - printf("16sc_deinterleave_real_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - - for(int i = 0; i < vlen; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - // printf("%d = generic... %d, sse2... %d, ssse3... %d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index ebb70b97a..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H -#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H - -#include -#include - -class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc deleted file mode 100644 index f86f03b88..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc +++ /dev/null @@ -1,124 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE4_1 - -#ifndef LV_HAVE_SSE - -void qa_16sc_deinterleave_real_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_16sc_deinterleave_real_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); - } - printf("16sc_deinterleave_real_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - } -} - -#endif /* SSE */ - -#else - -void qa_16sc_deinterleave_real_32f_aligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); - } - printf("16sc_deinterleave_real_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); - } -} - -#endif /* SSE4_1 */ diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index e83426473..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H -#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H - -#include -#include - -class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc deleted file mode 100644 index 803caaa2d..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSSE3 - -void qa_16sc_deinterleave_real_8s_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - -void qa_16sc_deinterleave_real_8s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); - int8_t output_orc[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0); - } - printf("16sc_deinterleave_real_8s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_8s_aligned16_manual(output_orc, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_orc[i]); - } -} - -#endif diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index 04e5511e5..000000000 --- a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H -#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H - -#include -#include - -class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc deleted file mode 100644 index 7fbdd8620..000000000 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE3 - -void qa_16sc_magnitude_16s_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#else - -void qa_16sc_magnitude_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_orc[vlen] __attribute__ ((aligned (16))); - int16_t output_sse[vlen] __attribute__ ((aligned (16))); - int16_t output_sse3[vlen] __attribute__ ((aligned (16))); - - int16_t* loadInput = (int16_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 32768.0)); - } - printf("16sc_magnitude_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_16s_aligned16_manual(output_orc, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); - } -} - -#endif diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h deleted file mode 100644 index 4664b70f4..000000000 --- a/volk/lib/qa_16sc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H -#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H - -#include -#include - -class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc deleted file mode 100644 index 54cc2ba6e..000000000 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE3 - -void qa_16sc_magnitude_32f_aligned16::t1() { - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_orc[vlen] __attribute__ ((aligned (16))); - float output_known[vlen] __attribute__ ((aligned (16))); - - int16_t* inputLoad = (int16_t*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (int16_t)(rand() - (RAND_MAX/2)); - } - printf("16sc_magnitude_32f_aligned\n"); - - float scale = 32768.0; - for(int i = 0; i < vlen; ++i) { - float re = (float)(input0[i].real())/scale; - float im = (float)(input0[i].imag())/scale; - output_known[i] = sqrt(re*re + im*im); - } - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, scale, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, scale, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - /* - for(int i = 0; i < 100; ++i) { - printf("inputs: %d + j%d\n", input0[i].real(), input0[i].imag()); - printf("generic... %f == %f\n", output_generic[i], output_known[i]); - } - */ - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_known[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_orc[i], output_known[i], fabs(output_generic[i])*1e-4); - } -} - -#else - -void qa_16sc_magnitude_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_orc[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse3[vlen] __attribute__ ((aligned (16))); - - int16_t* inputLoad = (int16_t*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("16sc_magnitude_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); -/* start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); -*/ - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); -// CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h deleted file mode 100644 index 0c25673ea..000000000 --- a/volk/lib/qa_16sc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H -#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H - -#include -#include - -class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc deleted file mode 100644 index c2295968b..000000000 --- a/volk/lib/qa_16u_byteswap_aligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_16u_byteswap_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_16u_byteswap_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100001; - - uint16_t output0[vlen] __attribute__ ((aligned (16))); - uint16_t output01[vlen] __attribute__ ((aligned (16))); - uint16_t output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); - } - memcpy(output01, output0, vlen*sizeof(uint16_t)); - memcpy(output02, output0, vlen*sizeof(uint16_t)); - - printf("16u_byteswap_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16u_byteswap_aligned16_manual(output0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16u_byteswap_aligned16_manual(output02, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h deleted file mode 100644 index e11b23e3f..000000000 --- a/volk/lib/qa_16u_byteswap_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H -#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H - -#include -#include - -class qa_16u_byteswap_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc deleted file mode 100644 index 0defef283..000000000 --- a/volk/lib/qa_32f_accumulator_aligned16.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_accumulator_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_accumulator_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - float accumulator_generic; - float accumulator_sse; - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_accumulator_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4); -} - -#endif diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h deleted file mode 100644 index 0004d3ff0..000000000 --- a/volk/lib/qa_32f_accumulator_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H -#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H - -#include -#include - -class qa_32f_accumulator_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc deleted file mode 100644 index a183d4d85..000000000 --- a/volk/lib/qa_32f_add_aligned16.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2010 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, see - * . - */ - -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_add_aligned16::t1() { - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output_known[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - output_known[i] = input0[i] + input1[i]; - } - printf("32f_add_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - /* - for(int i = 0; i < 10; ++i) { - printf("inputs: %f, %f\n", input0[i], input1[i]); - printf("generic... %f == %f\n", output0[i], output_known[i]); - } - */ - - for(int i = 0; i < vlen; ++i) { - CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); - } -} - -#else - -void qa_32f_add_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_add_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_add_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h deleted file mode 100644 index 58e2a151c..000000000 --- a/volk/lib/qa_32f_add_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H -#define INCLUDED_QA_32F_ADD_ALIGNED16_H - -#include -#include - -class qa_32f_add_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_add_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc deleted file mode 100644 index 5d6987333..000000000 --- a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_calc_spectral_noise_floor_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_calc_spectral_noise_floor_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - float output0[1] __attribute__ ((aligned (16))); - float output01[1] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_calc_spectral_noise_floor_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < 1; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h deleted file mode 100644 index c5dce2c4b..000000000 --- a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H -#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H - -#include -#include - -class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc deleted file mode 100644 index 3e2452e68..000000000 --- a/volk/lib/qa_32f_convert_16s_aligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_16s_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse[vlen] __attribute__ ((aligned (16))); - int16_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < vlen; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("%d generic... %d, sse... %d sse2... %d\n", i, output_generic[i], output_sse[i], output_sse2[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h deleted file mode 100644 index fce1eb417..000000000 --- a/volk/lib/qa_32f_convert_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H - -#include -#include - -class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc deleted file mode 100644 index e016b7ff7..000000000 --- a/volk/lib/qa_32f_convert_16s_unaligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_16s_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_16s_unaligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse[vlen] __attribute__ ((aligned (16))); - int16_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_16s_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_16s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h deleted file mode 100644 index 492bc80e6..000000000 --- a/volk/lib/qa_32f_convert_16s_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H - -#include -#include - -class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc deleted file mode 100644 index abceb52fb..000000000 --- a/volk/lib/qa_32f_convert_32s_aligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_32s_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_32s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int32_t output_generic[vlen] __attribute__ ((aligned (16))); - int32_t output_sse[vlen] __attribute__ ((aligned (16))); - int32_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_32s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h deleted file mode 100644 index 97d854463..000000000 --- a/volk/lib/qa_32f_convert_32s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H - -#include -#include - -class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc deleted file mode 100644 index 90f84b56f..000000000 --- a/volk/lib/qa_32f_convert_32s_unaligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_32s_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_32s_unaligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int32_t output_generic[vlen] __attribute__ ((aligned (16))); - int32_t output_sse[vlen] __attribute__ ((aligned (16))); - int32_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_32s_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_32s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h deleted file mode 100644 index 5d662d86d..000000000 --- a/volk/lib/qa_32f_convert_32s_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H - -#include -#include - -class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc deleted file mode 100644 index 1d0754ac9..000000000 --- a/volk/lib/qa_32f_convert_64f_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_64f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_64f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - double output_generic[vlen] __attribute__ ((aligned (16))); - double output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_64f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_64f_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h deleted file mode 100644 index 41eb3e094..000000000 --- a/volk/lib/qa_32f_convert_64f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H - -#include -#include - -class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc deleted file mode 100644 index 6f7d5066d..000000000 --- a/volk/lib/qa_32f_convert_64f_unaligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_64f_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_64f_unaligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - double output_generic[vlen] __attribute__ ((aligned (16))); - double output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_64f_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_64f_unaligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_64f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h deleted file mode 100644 index 4b144f033..000000000 --- a/volk/lib/qa_32f_convert_64f_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H - -#include -#include - -class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc deleted file mode 100644 index 6a53629b5..000000000 --- a/volk/lib/qa_32f_convert_8s_aligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_8s_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_8s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_sse[vlen] __attribute__ ((aligned (16))); - int8_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_8s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_aligned16_manual(output_sse, input0, 128.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_aligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h deleted file mode 100644 index 68a523f34..000000000 --- a/volk/lib/qa_32f_convert_8s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H - -#include -#include - -class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc deleted file mode 100644 index fbc5c20e6..000000000 --- a/volk/lib/qa_32f_convert_8s_unaligned16.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32f_convert_8s_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_convert_8s_unaligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_sse[vlen] __attribute__ ((aligned (16))); - int8_t output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_convert_8s_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_unaligned16_manual(output_generic, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_unaligned16_manual(output_sse, input0, 128.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_convert_8s_unaligned16_manual(output_sse2, input0, 128.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1); - CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1); - } -} - -#endif diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h deleted file mode 100644 index 88d4ff42a..000000000 --- a/volk/lib/qa_32f_convert_8s_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H -#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H - -#include -#include - -class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc deleted file mode 100644 index f2a1b9e7f..000000000 --- a/volk/lib/qa_32f_divide_aligned16.cc +++ /dev/null @@ -1,133 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2010 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, see - * . - */ - -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_divide_aligned16::t1() { - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output1[vlen] __attribute__ ((aligned (16))); - float output_known[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - output_known[i] = input0[i] / input1[i]; - } - printf("32f_divide_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_divide_aligned16_manual(output1, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - /* - for(int i = 0; i < 10; ++i) { - printf("inputs: %f, %f\n", input0[i], input1[i]); - printf("generic... %f == %f\n", output0[i], output_known[i]); - } - */ - - for(int i = 0; i < vlen; ++i) { - CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); - CPPUNIT_ASSERT_EQUAL(output1[i], output_known[i]); - } -} - -#else - -void qa_32f_divide_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_divide_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_divide_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h deleted file mode 100644 index 79d5ae4b8..000000000 --- a/volk/lib/qa_32f_divide_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H -#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H - -#include -#include - -class qa_32f_divide_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc deleted file mode 100644 index 98c1f2d99..000000000 --- a/volk/lib/qa_32f_dot_prod_aligned16.cc +++ /dev/null @@ -1,183 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define ERR_DELTA (1e-4) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifndef LV_HAVE_SSE4_1 - -#ifdef LV_HAVE_SSE3 -void qa_32f_dot_prod_aligned16::t1() { - const int vlen = 2046; - const int ITER = 100000; - - int i; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - float * input; - float * taps; - - float * result_generic; - float * result_sse; - float * result_sse3; - - ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); - ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); - - random_floats((float*)input, vlen); - random_floats((float*)taps, vlen); - - - printf("32f_dot_prod_aligned16\n"); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); - - for(i = 0; i < ITER; i++){ - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse); - free(result_sse3); - -} -#else -void qa_32f_dot_prod_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#endif /* LV_HAVE_SSE3 */ - -#else - -void qa_32f_dot_prod_aligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 4095; - const int ITER = 100000; - - int i; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - float * input; - float * taps; - - float * result_generic; - float * result_sse; - float * result_sse3; - float * result_sse4_1; - - ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); - - random_floats((float*)input, vlen); - random_floats((float*)taps, vlen); - - printf("32f_dot_prod_aligned16\n"); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); - for(i =0; i < ITER; i++){ - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse); - free(result_sse3); - free(result_sse4_1); - -} - -#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h deleted file mode 100644 index 6931a9e98..000000000 --- a/volk/lib/qa_32f_dot_prod_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H -#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H - -#include -#include - -class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc deleted file mode 100644 index 8e97d4249..000000000 --- a/volk/lib/qa_32f_dot_prod_unaligned16.cc +++ /dev/null @@ -1,190 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define ERR_DELTA (1e-4) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifndef LV_HAVE_SSE4_1 - -#ifdef LV_HAVE_SSE3 -void qa_32f_dot_prod_unaligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 2046; - const int ITER = 100000; - - int i; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - float * input; - float * taps; - - float * result_generic; - float * result_sse; - float * result_sse3; - - ret = posix_memalign((void**)&input, 16, vlen* sizeof(float)); - ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); - - random_floats((float*)input, vlen); - random_floats((float*)taps, vlen); - - - printf("32f_dot_prod_unaligned16\n"); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - printf("generic: %f ... sse: %f ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]); - - for(i = 0; i < ITER; i++){ - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse); - free(result_sse3); - -} -#else -void qa_32f_dot_prod_unaligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#endif /* LV_HAVE_SSE3 */ - -#else - -void qa_32f_dot_prod_unaligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 4095; - const int ITER = 100000; - - int i; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - float * input; - float * taps; - - float * result_generic; - float * result_sse; - float * result_sse3; - float * result_sse4_1; - - ret = posix_memalign((void**)&input, 16, (vlen+1) * sizeof(float)); - ret = posix_memalign((void**)&taps, 16, (vlen+1) * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float)); - ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float)); - - input = &input[1]; // Make sure the buffer is unaligned - taps = &taps[1]; // Make sure the buffer is unaligned - - random_floats((float*)input, vlen); - random_floats((float*)taps, vlen); - - printf("32f_dot_prod_unaligned16\n"); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - start = clock(); - for(i = 0; i < ITER; i++){ - get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - //printf("generic: %f ... sse: %f ... sse3 %f ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]); - for(i =0; i < ITER; i++){ - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA); - } - - free(&input[-1]); - free(&taps[-1]); - free(result_generic); - free(result_sse); - free(result_sse3); - free(result_sse4_1); - -} - -#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h deleted file mode 100644 index e8bad07fe..000000000 --- a/volk/lib/qa_32f_dot_prod_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H -#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H - -#include -#include - -class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc deleted file mode 100644 index a7ae60780..000000000 --- a/volk/lib/qa_32f_interleave_16sc_aligned16.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_32f_interleave_16sc_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32f_interleave_16sc_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - std::complex output_generic[vlen] __attribute__ ((aligned (16))); - std::complex output_sse[vlen] __attribute__ ((aligned (16))); - std::complex output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_interleave_16sc_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), 1.01); - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), 1.01); - - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse2[i]), 1.01); - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse2[i]), 1.01); - } -} - -#endif diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h deleted file mode 100644 index 8d2914817..000000000 --- a/volk/lib/qa_32f_interleave_16sc_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H -#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H - -#include -#include - -class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc deleted file mode 100644 index 333b6fce8..000000000 --- a/volk/lib/qa_32f_interleave_32fc_aligned16.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_interleave_32fc_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_interleave_32fc_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - std::complex output_generic[vlen] __attribute__ ((aligned (16))); - std::complex output_sse[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_interleave_32fc_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), fabs(std::real(output_generic[i]))*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), fabs(std::imag(output_generic[i]))*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h deleted file mode 100644 index cba518d37..000000000 --- a/volk/lib/qa_32f_interleave_32fc_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H -#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H - -#include -#include - -class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc deleted file mode 100644 index 98f8ce9bc..000000000 --- a/volk/lib/qa_32f_max_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_max_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_max_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_max_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_max_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h deleted file mode 100644 index d535479f4..000000000 --- a/volk/lib/qa_32f_max_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H -#define INCLUDED_QA_32F_MAX_ALIGNED16_H - -#include -#include - -class qa_32f_max_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_max_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc deleted file mode 100644 index 798b47c53..000000000 --- a/volk/lib/qa_32f_min_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_min_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_min_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_min_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_min_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h deleted file mode 100644 index 90961ac92..000000000 --- a/volk/lib/qa_32f_min_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H -#define INCLUDED_QA_32F_MIN_ALIGNED16_H - -#include -#include - -class qa_32f_min_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_min_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc deleted file mode 100644 index aa17cd62e..000000000 --- a/volk/lib/qa_32f_multiply_aligned16.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2010 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, see - * . - */ - -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_multiply_aligned16::t1() { - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output_known[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - output_known[i] = input0[i] * input1[i]; - } - printf("32f_multiply_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - /* - for(int i = 0; i < 10; ++i) { - printf("inputs: %f, %f\n", input0[i], input1[i]); - printf("generic... %f == %f\n", output0[i], output_known[i]); - } - */ - - for(int i = 0; i < vlen; ++i) { - CPPUNIT_ASSERT_EQUAL(output0[i], output_known[i]); - } -} - -#else - -void qa_32f_multiply_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_multiply_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_multiply_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h deleted file mode 100644 index 7032a2ad4..000000000 --- a/volk/lib/qa_32f_multiply_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H -#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H - -#include -#include - -class qa_32f_multiply_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc deleted file mode 100644 index 0da43ecff..000000000 --- a/volk/lib/qa_32f_normalize_aligned16.cc +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_normalize_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_normalize_aligned16::t1() { - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - const int vlen = 320001; - const int ITERS = 100; - - float* output0; - float* output01; - float* output02; - ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float)); - ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float)); - ret = posix_memalign((void**)&output02, 16, vlen*sizeof(float)); - - for(int i = 0; i < vlen; ++i) { - output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - memcpy(output01, output0, vlen*sizeof(float)); - memcpy(output02, output0, vlen*sizeof(float)); - printf("32f_normalize_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_normalize_aligned16_manual(output02, 1.15, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - // printf("%e...%e\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output02[i], fabs(output0[i])*1e-4); - } - - free(output0); - free(output01); - free(output02); -} - -#endif diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h deleted file mode 100644 index 7c421eb82..000000000 --- a/volk/lib/qa_32f_normalize_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H -#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H - -#include -#include - -class qa_32f_normalize_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc deleted file mode 100644 index 1b331daeb..000000000 --- a/volk/lib/qa_32f_power_aligned16.cc +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define ERR_DELTA (1e-4) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifdef LV_HAVE_SSE -void qa_32f_power_aligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 2046; - const int ITERS = 10000; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - float* input; - int i; - - float* result_generic; - float* result_sse; - float* result_sse4_1; - - ret = posix_memalign((void**)&input, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float)); - - random_floats((float*)input, vlen); - - const float power = 3; - - printf("32f_power_aligned16\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_power_aligned16_manual(result_generic, input, power, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_power_aligned16_manual(result_sse, input, power, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4.1_time: %f\n", total); - - - for(i = 0; i < vlen; i++){ - //printf("%d %e -> %e %e %e\n", i, input[i], result_generic[i], result_sse[i], result_sse4_1[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA); - CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA); - } - - free(input); - free(result_generic); - free(result_sse); - -} -#else -void qa_32f_power_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#endif /* LV_HAVE_SSE */ - diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h deleted file mode 100644 index d45df4e56..000000000 --- a/volk/lib/qa_32f_power_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H -#define INCLUDED_QA_32F_POWER_ALIGNED16_H - -#include -#include - -class qa_32f_power_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_power_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc deleted file mode 100644 index c216ce5d5..000000000 --- a/volk/lib/qa_32f_sqrt_aligned16.cc +++ /dev/null @@ -1,128 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2010 Free Software Foundation, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, see - * . - */ - -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_sqrt_aligned16::t1() { - printf("sse not available... no test performed\n"); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - float input0[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output_known[vlen] __attribute__ ((aligned (16))); - - // No reason to test negative numbers because they result in NaN. - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); - output_known[i] = sqrt(input0[i]); - } - printf("32f_sqrt_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - /* - for(int i = 0; i < 10; ++i) { - printf("inputs: %f\n", input0[i]); - printf("generic... %f == %f\n", output0[i], output_known[i]); - } - */ - - for(int i = 0; i < vlen; ++i) { - CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output_known[i], fabs(output0[i])*1e-4); - } -} - -#else - -void qa_32f_sqrt_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - - // No reason to test negative numbers because they result in NaN. - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand()) / static_cast(RAND_MAX)); - } - printf("32f_sqrt_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h deleted file mode 100644 index e4b99d981..000000000 --- a/volk/lib/qa_32f_sqrt_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H -#define INCLUDED_QA_32F_SQRT_ALIGNED16_H - -#include -#include - -class qa_32f_sqrt_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc deleted file mode 100644 index 5934d70df..000000000 --- a/volk/lib/qa_32f_stddev_aligned16.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_stddev_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_stddev_aligned16::t1() { - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - float stddev_generic; - float stddev_sse; - float stddev_sse4_1; - float mean = 0; - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - mean += input0[i]; - } - mean /= static_cast(vlen); - - printf("32f_stddev_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); - -} - -#endif diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h deleted file mode 100644 index 7f8d7a5fc..000000000 --- a/volk/lib/qa_32f_stddev_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H -#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H - -#include -#include - -class qa_32f_stddev_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc deleted file mode 100644 index 78c701d78..000000000 --- a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_stddev_and_mean_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_stddev_and_mean_aligned16::t1() { - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - - float stddev_generic; - float stddev_sse; - float stddev_sse4_1; - float mean_generic; - float mean_sse; - float mean_sse4_1; - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_stddev_and_mean_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4); - - CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4); - -} - -#endif diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h deleted file mode 100644 index e08bd249a..000000000 --- a/volk/lib/qa_32f_stddev_and_mean_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H -#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H - -#include -#include - -class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc deleted file mode 100644 index 1e2210203..000000000 --- a/volk/lib/qa_32f_subtract_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32f_subtract_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32f_subtract_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - float input0[vlen] __attribute__ ((aligned (16))); - float input1[vlen] __attribute__ ((aligned (16))); - - float output0[vlen] __attribute__ ((aligned (16))); - float output01[vlen] __attribute__ ((aligned (16))); - float output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("32f_subtract_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_subtract_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h deleted file mode 100644 index 97c14f129..000000000 --- a/volk/lib/qa_32f_subtract_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H -#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H - -#include -#include - -class qa_32f_subtract_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc deleted file mode 100644 index 494776357..000000000 --- a/volk/lib/qa_32f_sum_of_poly_aligned16.cc +++ /dev/null @@ -1,142 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define SNR 30.0 -#define CENTER -4.0 -#define CUTOFF -5.595 -#define ERR_DELTA (1e-4) -#define NUM_ITERS 100000 -#define VEC_LEN 64 -static float uniform() { - return ((float) rand() / RAND_MAX); // uniformly (0, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - unsigned int i = 0; - for (; i < n; i++) { - - buf[i] = uniform () * -SNR/2.0; - - } -} - - -#ifndef LV_HAVE_SSE3 - -void qa_32f_sum_of_poly_aligned16::t1(){ - printf("sse3 not available... no test performed\n"); -} - -#else - - -void qa_32f_sum_of_poly_aligned16::t1(){ - int i = 0; - - volk_environment_init(); - int ret; - - const int vlen = VEC_LEN; - float cutoff = CUTOFF; - - float* center_point_array; - float* target; - float* target_generic; - float* src0 ; - - - ret = posix_memalign((void**)¢er_point_array, 16, 24); - ret = posix_memalign((void**)&target, 16, 4); - ret = posix_memalign((void**)&target_generic, 16, 4); - ret = posix_memalign((void**)&src0, 16, (vlen << 2)); - - - random_floats((float*)src0, vlen); - - float a = (float)CENTER; - float etoa = expf(a); - center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 + - (-4.0 * a * a * a)/24.0 + - (3.0 * a * a)/6.0 + - (-2.0 * a)/2.0 + - (1.0)) * etoa; - center_point_array[1] = (//(-10.0 * a * a * a)/120.0 + - (6.0 * a * a)/24.0 + - (-3.0 * a)/6.0 + - (1.0/2.0)) * etoa; - center_point_array[2] = (//(10.0 * a * a)/120.0 + - (-4.0 * a)/24.0 + - (1.0/6.0)) * etoa; - center_point_array[3] = (//(-5.0 * a)/120.0 + - (1.0/24.0)) * etoa; - //center_point_array[4] = ((1.0)/120.0) * etoa; - center_point_array[4] = (//(a * a * a * a * a)/120.0 + - (a * a * a * a)/24.0 + - (a * a * a)/-6.0 + - (a * a)/2.0 + - -a + 1.0) * etoa; - - printf("32f_sum_of_poly_aligned16\n"); - - clock_t start, end; - double total; - - float my_sum = 0.0; - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - float sum = 0.0; - for(int l = 0; l < vlen; ++l) { - - sum += expf(src0[l]); - - } - my_sum = sum; - } - - - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("exp time: %f\n", total); - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - - volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic"); - - } - - - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic time: %f\n", total); - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - volk_32f_sum_of_poly_aligned16_manual(target, src0, center_point_array, &cutoff, vlen << 2, "sse3"); - } - - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3 approx time: %f\n", total); - - - - printf("exp: %f, sse3: %f\n", my_sum, target[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA); - - - free(center_point_array); - free(target); - free(target_generic); - free(src0); - - -} - -#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h deleted file mode 100644 index 67a347f9a..000000000 --- a/volk/lib/qa_32f_sum_of_poly_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H -#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H - -#include -#include - -class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc deleted file mode 100644 index b80e0e008..000000000 --- a/volk/lib/qa_32fc_32f_multiply_aligned16.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#define TOLERANCE (1e-4) - -void qa_32fc_32f_multiply_aligned16(void) { - - const int vlen = 2046; - const int ITERS = 100000; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - float * taps; - int i; - std::vector archs; - archs.push_back("generic"); -#ifdef LV_HAVE_SSE3 - archs.push_back("sse3"); -#endif -#ifdef LV_HAVE_ORC - archs.push_back("orc"); -#endif - - std::vector* > results; - - ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); - ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float)); - - for(i=0; i < archs.size(); i++) { - std::complex *ptr; - ret = posix_memalign((void**)&ptr, 16, vlen * 2 * sizeof(float)); - if(ret) { - printf("Couldn't allocate memory\n"); - exit(1); - } - results.push_back(ptr); - } - - random_floats((float*)input, vlen * 2); - random_floats(taps, vlen); - - printf("32fc_32f_multiply_aligned16\n"); - - for(i=0; i < archs.size(); i++) { - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_multiply_aligned16_manual(results[i], input, taps, vlen, archs[i].c_str()); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("%s_time: %f\n", archs[i].c_str(), total); - } - - for(i=0; i < vlen; i++) { - int j = 1; - for(j; j < archs.size(); j++) { - assertcomplexEqual(results[0][i], results[j][i], ERR_DELTA); - } - } - - free(input); - free(taps); - for(i=0; i < archs.size(); i++) { - free(results[i]); - } -} diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h deleted file mode 100644 index fc3b3eeb2..000000000 --- a/volk/lib/qa_32fc_32f_multiply_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H -#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H - -#include -#include - -class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc deleted file mode 100644 index 64ea65da9..000000000 --- a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1.5e-3) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifdef LV_HAVE_SSE -void qa_32fc_32f_power_32fc_aligned16::t1() { - - const int vlen = 2046; - const int ITERS = 10000; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - int i; - - std::complex* result_generic; - std::complex* result_sse; - - ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float)); - ret = posix_memalign((void**)&result_sse, 16, vlen * 2 * sizeof(float)); - - random_floats((float*)input, vlen * 2); - - const float power = 3.2; - - printf("32fc_32f_power_32fc_aligned16\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(i = 0; i < vlen; i++){ - assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA); - } - - free(input); - free(result_generic); - free(result_sse); - -} -#else -void qa_32fc_32f_power_32fc_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#endif /* LV_HAVE_SSE */ - diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h deleted file mode 100644 index 464b7b7cc..000000000 --- a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H -#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H - -#include -#include - -class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc deleted file mode 100644 index c55ab5aa0..000000000 --- a/volk/lib/qa_32fc_atan2_32f_aligned16.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32fc_atan2_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32fc_atan2_32f_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_atan2_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h deleted file mode 100644 index 9c4dc209a..000000000 --- a/volk/lib/qa_32fc_atan2_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H -#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H - -#include -#include - -class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc deleted file mode 100644 index 2f9a30395..000000000 --- a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include -#include -#include -#include - - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-4) - -//test for sse - -#if LV_HAVE_SSE && LV_HAVE_64 - -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform () * 32767; -} - - -void qa_32fc_conjugate_dot_prod_aligned16::t1() { - const int vlen = 789743; - - volk_environment_init(); - int ret; - - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result, 16, 8); - - - result_generic[0] = std::complex(0,0); - result[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - - - volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); - - - volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse"); - - printf("32fc_conjugate_dot_prod_aligned16\n"); - printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); - - assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result); - -} - - -#elif LV_HAVE_SSE && LV_HAVE_32 - -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform () * 32767; -} - - -void qa_32fc_conjugate_dot_prod_aligned16::t1() { - const int vlen = 789743; - - volk_environment_init(); - int ret; - - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result, 16, 8); - - - result_generic[0] = std::complex(0,0); - result[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - - - volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); - - - volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse_32"); - - printf("32fc_conjugate_dot_prod_aligned16\n"); - printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); - - assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result); - -} - - -#else - -void qa_32fc_conjugate_dot_prod_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h deleted file mode 100644 index 507b1769b..000000000 --- a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H -#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H - -#include -#include - -class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc deleted file mode 100644 index 72e084c05..000000000 --- a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32fc_deinterleave_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32fc_deinterleave_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_generic1[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse1[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_deinterleave_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h deleted file mode 100644 index 78660e6ad..000000000 --- a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc deleted file mode 100644 index 89770c236..000000000 --- a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32fc_deinterleave_64f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32fc_deinterleave_64f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - double output_generic[vlen] __attribute__ ((aligned (16))); - double output_generic1[vlen] __attribute__ ((aligned (16))); - double output_sse2[vlen] __attribute__ ((aligned (16))); - double output_sse21[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_deinterleave_64f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h deleted file mode 100644 index f924b9752..000000000 --- a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H -#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H - -#include -#include - -class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc deleted file mode 100644 index 7472476f7..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32fc_deinterleave_real_16s_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32fc_deinterleave_real_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_deinterleave_real_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index 68b80f27d..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H -#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H - -#include -#include - -class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc deleted file mode 100644 index 5cbdc49b3..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32fc_deinterleave_real_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32fc_deinterleave_real_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_deinterleave_real_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 765450bb6..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H -#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H - -#include -#include - -class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc deleted file mode 100644 index 4147e30ae..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_32fc_deinterleave_real_64f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32fc_deinterleave_real_64f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - double output_generic[vlen] __attribute__ ((aligned (16))); - double output_sse2[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_deinterleave_real_64f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h deleted file mode 100644 index 3e55fb812..000000000 --- a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H -#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H - -#include -#include - -class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc deleted file mode 100644 index bcf9ea954..000000000 --- a/volk/lib/qa_32fc_dot_prod_aligned16.cc +++ /dev/null @@ -1,214 +0,0 @@ -#include -#include -#include -#include -#include -#include - - - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-4) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - - - -#if LV_HAVE_SSE3 -void qa_32fc_dot_prod_aligned16::t1() { - - const int vlen = 2046; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse3; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result_sse3, 16, 8); - - - result_generic[0] = std::complex(0,0); - result_sse3[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - printf("32fc_dot_prod_aligned16\n"); - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse3"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); - - - assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result_sse3); - -} - -#else -void qa_32fc_dot_prod_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 -void qa_32fc_dot_prod_aligned16::t2() { - - const int vlen = 2046; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse3; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result_sse3, 16, 8); - - - result_generic[0] = std::complex(0,0); - result_sse3[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - printf("32fc_dot_prod_aligned16\n"); - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_32"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_32_time: %f\n", total); - - printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); - - - assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result_sse3); - -} - -#else -void qa_32fc_dot_prod_aligned16::t2() { - printf("sse_32 not available... no test performed\n"); -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_64 - -void qa_32fc_dot_prod_aligned16::t3() { - - const int vlen = 2046; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse3; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result_sse3, 16, 8); - - - result_generic[0] = std::complex(0,0); - result_sse3[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - printf("32fc_dot_prod_aligned16\n"); - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8, "generic"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - - start = clock(); - volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_64"); - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_64_time: %f\n", total); - - printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0])); - - - assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result_sse3); - -} - -#else -void qa_32fc_dot_prod_aligned16::t3() { - printf("sse_64 not available... no test performed\n"); -} - - - -#endif diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h deleted file mode 100644 index 4b360db27..000000000 --- a/volk/lib/qa_32fc_dot_prod_aligned16.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H -#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H - -#include -#include - -class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); - void t2 (); - void t3 (); -}; - - -#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc deleted file mode 100644 index c718b6b71..000000000 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE3 - -void qa_32fc_magnitude_16s_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#else - -void qa_32fc_magnitude_16s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_orc[vlen] __attribute__ ((aligned (16))); - int16_t output_sse[vlen] __attribute__ ((aligned (16))); - int16_t output_sse3[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_magnitude_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_16s_aligned16_manual(output_orc, input0, 32768.0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - // printf("inputs: %f, %f\n", input0[i].real(), input0[i].imag()); - // printf("generic... %i, sse3... %i, orc... %i\n", output_generic[i], output_sse3[i], output_orc[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], 1.1); - } -} - -#endif diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h deleted file mode 100644 index ffdf1dd9e..000000000 --- a/volk/lib/qa_32fc_magnitude_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H -#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H - -#include -#include - -class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc deleted file mode 100644 index 1d475fb86..000000000 --- a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE3 - -void qa_32fc_magnitude_32f_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#else - -void qa_32fc_magnitude_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_orc[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse3[vlen] __attribute__ ((aligned (16))); - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - printf("32fc_magnitude_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_32f_aligned16_manual(output_orc, input0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_orc[i], fabs(output_generic[i])*1e-4); - } -} - -#endif diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h deleted file mode 100644 index a2881308c..000000000 --- a/volk/lib/qa_32fc_magnitude_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H -#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H - -#include -#include - -class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc deleted file mode 100644 index 022b58ad6..000000000 --- a/volk/lib/qa_32fc_multiply_aligned16.cc +++ /dev/null @@ -1,98 +0,0 @@ -#include -#include -#include -#include -#include -#include - - - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-3) - -//test for sse -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); -} - -#ifdef LV_HAVE_SSE3 -void qa_32fc_multiply_aligned16::t1() { - - const int vlen = 2046; - const int ITERS = 100000; - - int i; - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse3; - std::complex* result_orc; - - ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float)); - ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float)); - ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); - ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float)); - ret = posix_memalign((void**)&result_orc, 16, vlen*2*sizeof(float)); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - printf("32fc_multiply_aligned16\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_multiply_aligned16_manual(result_orc, input, taps, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - for(i = 0; i < vlen; i++){ - assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA); - assertcomplexEqual(result_generic[i], result_orc[i], ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse3); - free(result_orc); - -} -#else -void qa_32fc_multiply_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#endif /* LV_HAVE_SSE3 */ diff --git a/volk/lib/qa_32fc_multiply_aligned16.h b/volk/lib/qa_32fc_multiply_aligned16.h deleted file mode 100644 index c8abaa8fe..000000000 --- a/volk/lib/qa_32fc_multiply_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H -#define INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H - -#include -#include - -class qa_32fc_multiply_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc deleted file mode 100644 index 1444c78a9..000000000 --- a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse3 - -#ifndef LV_HAVE_SSE3 - -void qa_32fc_power_spectrum_32f_aligned16::t1() { - printf("sse3 not available... no test performed\n"); -} - -#else - -void qa_32fc_power_spectrum_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 10000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse3[vlen] __attribute__ ((aligned (16))); - - const float scalar = vlen; - - float* inputLoad = (float*)input0; - for(int i = 0; i < 2*vlen; ++i) { - inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); - } - - printf("32fc_power_spectrum_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse33... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); - } -} - -#endif diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h deleted file mode 100644 index d991223f3..000000000 --- a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H -#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H - -#include -#include - -class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc deleted file mode 100644 index d9ead8495..000000000 --- a/volk/lib/qa_32fc_square_dist_aligned16.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include -#include -#include -#include -#include - -#define ERR_DELTA (1e-4) -#define NUM_ITERS 10000000 -#define VEC_LEN 64 -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - unsigned int i = 0; - for (; i < n; i++) { - - buf[i] = uniform () * 32767; - - } -} - - -#ifndef LV_HAVE_SSE3 - -void qa_32fc_square_dist_aligned16::t1(){ - printf("sse3 not available... no test performed\n"); -} - -#else - - -void qa_32fc_square_dist_aligned16::t1(){ - int i = 0; - - const int vlen = VEC_LEN; - volk_environment_init(); - int ret; - - float* target; - float* target_generic; - std::complex* src0 ; - std::complex* points; - - ret = posix_memalign((void**)&points, 16, vlen << 3); - ret = posix_memalign((void**)&target, 16, vlen << 2); - ret = posix_memalign((void**)&target_generic, 16, vlen << 2); - ret = posix_memalign((void**)&src0, 16, 8); - - random_floats((float*)points, vlen * 2); - random_floats((float*)src0, 2); - - printf("32fc_square_dist_aligned16\n"); - - clock_t start, end; - double total; - - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic time: %f\n", total); - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3"); - } - - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3 time: %f\n", total); - - - - for(; i < vlen; ++i) { - //printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA); - } - - free(target); - free(target_generic); - free(points); - free(src0); -} - -#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h deleted file mode 100644 index 9d365d8b0..000000000 --- a/volk/lib/qa_32fc_square_dist_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H -#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H - -#include -#include - -class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_square_dist_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc deleted file mode 100644 index f923d1d5c..000000000 --- a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define ERR_DELTA .0001 -#define NUM_ITERS 10000000 -#define VEC_LEN 64 - -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - unsigned int i = 0; - for (; i < n; i++) { - - buf[i] = uniform () * 32767; - - } -} - - -#ifndef LV_HAVE_SSE3 - -void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ - printf("sse3 not available... no test performed\n"); -} - -#else - - -void qa_32fc_square_dist_scalar_mult_aligned16::t1(){ - int i = 0; - - const int vlen = VEC_LEN; - - volk_environment_init(); - int ret; - - float* target; - float* target_generic; - std::complex* src0 ; - std::complex* points; - float scalar; - - ret = posix_memalign((void**)&points, 16, vlen << 3); - ret = posix_memalign((void**)&target, 16, vlen << 2); - ret = posix_memalign((void**)&target_generic, 16, vlen << 2); - ret = posix_memalign((void**)&src0, 16, 8); - - random_floats((float*)points, vlen * 2); - random_floats((float*)src0, 2); - random_floats(&scalar, 1); - - printf("32fc_square_dist_scalar_mult_aligned16\n"); - - clock_t start, end; - double total; - - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic time: %f\n", total); - - start = clock(); - for(int k = 0; k < NUM_ITERS; ++k) { - volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 3, "sse3"); - } - - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse3 time: %f\n", total); - - - - for(i = 0; i < vlen; ++i) { - printf("generic: %f, sse3: %f\n", target_generic[i], target[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[1]) * ERR_DELTA);//, target_generic[1] * ERR_DELTA); - } - - free(target); - free(target_generic); - free(points); - free(src0); -} - -#endif /*LV_HAVE_SSE3*/ diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h deleted file mode 100644 index ac4e3c45b..000000000 --- a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H -#define INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H - -#include -#include - -class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc deleted file mode 100644 index d20682147..000000000 --- a/volk/lib/qa_32s_and_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32s_and_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32s_and_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int32_t input0[vlen] __attribute__ ((aligned (16))); - int32_t input1[vlen] __attribute__ ((aligned (16))); - - int32_t output0[vlen] __attribute__ ((aligned (16))); - int32_t output01[vlen] __attribute__ ((aligned (16))); - int32_t output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); - input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); - } - printf("32s_and_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_and_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h deleted file mode 100644 index dfcb47c63..000000000 --- a/volk/lib/qa_32s_and_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H -#define INCLUDED_QA_32S_AND_ALIGNED16_H - -#include -#include - -class qa_32s_and_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32s_and_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc deleted file mode 100644 index 07d799809..000000000 --- a/volk/lib/qa_32s_convert_32f_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32s_convert_32f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32s_convert_32f_aligned16::t1() { - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - - int32_t input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("32s_convert_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h deleted file mode 100644 index efd2a2eea..000000000 --- a/volk/lib/qa_32s_convert_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H -#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H - -#include -#include - -class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc deleted file mode 100644 index 2ec610ffb..000000000 --- a/volk/lib/qa_32s_convert_32f_unaligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_32s_convert_32f_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32s_convert_32f_unaligned16::t1() { - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - - int32_t input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 32768.0)); - } - printf("32s_convert_32f_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h deleted file mode 100644 index 5006f5fd8..000000000 --- a/volk/lib/qa_32s_convert_32f_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H -#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H - -#include -#include - -class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc deleted file mode 100644 index bebf779b0..000000000 --- a/volk/lib/qa_32s_or_aligned16.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE - -void qa_32s_or_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_32s_or_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int32_t input0[vlen] __attribute__ ((aligned (16))); - int32_t input1[vlen] __attribute__ ((aligned (16))); - - int32_t output0[vlen] __attribute__ ((aligned (16))); - int32_t output01[vlen] __attribute__ ((aligned (16))); - int32_t output02[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int32_t) (rand() - (RAND_MAX/2))); - input1[i] = ((int32_t) (rand() - (RAND_MAX/2))); - } - printf("32s_or_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_or_aligned16_manual(output02, input0, input1, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output02[i]); - } -} - -#endif diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h deleted file mode 100644 index 9e949eb52..000000000 --- a/volk/lib/qa_32s_or_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H -#define INCLUDED_QA_32S_OR_ALIGNED16_H - -#include -#include - -class qa_32s_or_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32s_or_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */ diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc deleted file mode 100644 index 313c786b6..000000000 --- a/volk/lib/qa_32u_byteswap_aligned16.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_32u_byteswap_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_32u_byteswap_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100001; - - uint32_t output0[vlen] __attribute__ ((aligned (16))); - uint32_t output01[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - output0[i] = (uint32_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); - } - memcpy(output01, output0, vlen*sizeof(uint32_t)); - printf("32u_byteswap_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32u_byteswap_aligned16_manual(output0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - } -} - -#endif diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h deleted file mode 100644 index 47bad4c3d..000000000 --- a/volk/lib/qa_32u_byteswap_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H -#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H - -#include -#include - -class qa_32u_byteswap_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc deleted file mode 100644 index 7f9c4584a..000000000 --- a/volk/lib/qa_64f_convert_32f_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_64f_convert_32f_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_64f_convert_32f_aligned16::t1() { - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - - double input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("64f_convert_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h deleted file mode 100644 index 95d79f73d..000000000 --- a/volk/lib/qa_64f_convert_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H -#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H - -#include -#include - -class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc deleted file mode 100644 index 98aadbf4d..000000000 --- a/volk/lib/qa_64f_convert_32f_unaligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse2 - -#ifndef LV_HAVE_SSE2 - -void qa_64f_convert_32f_unaligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_64f_convert_32f_unaligned16::t1() { - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - - double input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse2[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("64f_convert_32f_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]); - } -} - -#endif diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h deleted file mode 100644 index 430327e81..000000000 --- a/volk/lib/qa_64f_convert_32f_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H -#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H - -#include -#include - -class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc deleted file mode 100644 index 76e755514..000000000 --- a/volk/lib/qa_64f_max_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_64f_max_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_64f_max_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - double input0[vlen] __attribute__ ((aligned (16))); - double input1[vlen] __attribute__ ((aligned (16))); - - double output0[vlen] __attribute__ ((aligned (16))); - double output01[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("64f_max_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - } -} - -#endif diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h deleted file mode 100644 index 7cbd4d4c1..000000000 --- a/volk/lib/qa_64f_max_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H -#define INCLUDED_QA_64F_MAX_ALIGNED16_H - -#include -#include - -class qa_64f_max_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_64f_max_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */ diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc deleted file mode 100644 index 4b70d2881..000000000 --- a/volk/lib/qa_64f_min_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_64f_min_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_64f_min_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - double input0[vlen] __attribute__ ((aligned (16))); - double input1[vlen] __attribute__ ((aligned (16))); - - double output0[vlen] __attribute__ ((aligned (16))); - double output01[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); - } - printf("64f_min_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - } -} - -#endif diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h deleted file mode 100644 index a0e95395f..000000000 --- a/volk/lib/qa_64f_min_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H -#define INCLUDED_QA_64F_MIN_ALIGNED16_H - -#include -#include - -class qa_64f_min_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_64f_min_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */ diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc deleted file mode 100644 index 20d012c9e..000000000 --- a/volk/lib/qa_64u_byteswap_aligned16.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE2 - -void qa_64u_byteswap_aligned16::t1() { - printf("sse2 not available... no test performed\n"); -} - -#else - -void qa_64u_byteswap_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100001; - - uint64_t output0[vlen] __attribute__ ((aligned (16))); - uint64_t output01[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - output0[i] = (uint64_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2)); - } - memcpy(output01, output0, vlen*sizeof(uint64_t)); - printf("64u_byteswap_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64u_byteswap_aligned16_manual(output0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse2_time: %f\n", total); - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); - } -} - -#endif diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h deleted file mode 100644 index a4fa0c983..000000000 --- a/volk/lib/qa_64u_byteswap_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H -#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H - -#include -#include - -class qa_64u_byteswap_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc deleted file mode 100644 index 8dd5f76ca..000000000 --- a/volk/lib/qa_8s_convert_16s_aligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse4_1 - -#ifndef LV_HAVE_SSE4_1 - -void qa_8s_convert_16s_aligned16::t1() { - printf("sse4.1 not available... no test performed\n"); -} - -#else - -void qa_8s_convert_16s_aligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int8_t input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); - } - printf("8s_convert_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h deleted file mode 100644 index 38739fc96..000000000 --- a/volk/lib/qa_8s_convert_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H -#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H - -#include -#include - -class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc deleted file mode 100644 index 12c502d4b..000000000 --- a/volk/lib/qa_8s_convert_16s_unaligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse4_1 - -#ifndef LV_HAVE_SSE4_1 - -void qa_8s_convert_16s_unaligned16::t1() { - printf("sse4.1 not available... no test performed\n"); -} - -#else - -void qa_8s_convert_16s_unaligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int8_t input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); - } - printf("8s_convert_16s_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8s_convert_16s_unaligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, input0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h deleted file mode 100644 index d39fffc35..000000000 --- a/volk/lib/qa_8s_convert_16s_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H -#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H - -#include -#include - -class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc deleted file mode 100644 index f27e60552..000000000 --- a/volk/lib/qa_8s_convert_32f_aligned16.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse4.1 - -#ifndef LV_HAVE_SSE4_1 - -void qa_8s_convert_32f_aligned16::t1() { - printf("sse4_1 not available... no test performed\n"); -} - -#else - -void qa_8s_convert_32f_aligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int8_t input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); - } - printf("8s_convert_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "orc"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("orc_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h deleted file mode 100644 index 7f8401d42..000000000 --- a/volk/lib/qa_8s_convert_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H -#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H - -#include -#include - -class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc deleted file mode 100644 index 43468b1b1..000000000 --- a/volk/lib/qa_8s_convert_32f_unaligned16.cc +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse4.1 - -#ifndef LV_HAVE_SSE4_1 - -void qa_8s_convert_32f_unaligned16::t1() { - printf("sse4_1 not available... no test performed\n"); -} - -#else - -void qa_8s_convert_32f_unaligned16::t1() { - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - int8_t input0[vlen+1] __attribute__ ((aligned (16))); - - float output_generic[vlen+1] __attribute__ ((aligned (16))); - float output_sse4_1[vlen+1] __attribute__ ((aligned (16))); - - for(int i = 0; i < vlen; ++i) { - input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)) * 128.0)); - } - printf("8s_convert_32f_unaligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%e...%e\n", output_generic[i], output_sse4_1[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h deleted file mode 100644 index aad2f8c22..000000000 --- a/volk/lib/qa_8s_convert_32f_unaligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H -#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H - -#include -#include - -class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc deleted file mode 100644 index f753e1107..000000000 --- a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE4_1 - -void qa_8sc_deinterleave_16s_aligned16::t1() { - printf("sse4_1 not available... no test performed\n"); -} - -#else - -void qa_8sc_deinterleave_16s_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_generic1[vlen] __attribute__ ((aligned (16))); - int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); - int16_t output_sse4_11[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "monkeys"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4.1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - CPPUNIT_ASSERT_EQUAL(output_generic1[i], output_sse4_11[i]); - } -} - -#endif diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h deleted file mode 100644 index 9c99fed70..000000000 --- a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H -#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H - -#include -#include - -class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc deleted file mode 100644 index 29073eed7..000000000 --- a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE4_1 - -#ifndef LV_HAVE_SSE - -void qa_8sc_deinterleave_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_8sc_deinterleave_32f_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_generic1[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse1[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic[i])*1e-4); - } -} - -#endif /* LV_HAVE_SSE */ - -#else - -void qa_8sc_deinterleave_32f_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_generic1[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - float output_sse1[vlen] __attribute__ ((aligned (16))); - float output_sse4_1[vlen] __attribute__ ((aligned (16))); - float output_sse14_1[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4.1_time: %f\n", total); - - for(int i = 0; i < vlen; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i],std::max((output_generic[i])*1e-4, 1e-4)); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max((output_generic[i])*1e-4, 1e-4)); - - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max((output_generic[i])*1e-4, 1e-4)); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max((output_generic[i])*1e-4, 1e-4)); - } -} - - -#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h deleted file mode 100644 index 63b5fdadb..000000000 --- a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H -#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H - -#include -#include - -class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc deleted file mode 100644 index 4980c982a..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE4_1 - -void qa_8sc_deinterleave_real_16s_aligned16::t1() { - printf("sse4_1 not available... no test performed\n"); -} - -#else - -void qa_8sc_deinterleave_real_16s_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int16_t output_generic[vlen] __attribute__ ((aligned (16))); - int16_t output_sse4_1[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_real_16s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4.1_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]); - } -} - -#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h deleted file mode 100644 index 02050926f..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H -#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H - -#include -#include - -class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc deleted file mode 100644 index 3c3f737a1..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc +++ /dev/null @@ -1,139 +0,0 @@ -#include -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSE4_1 - -#ifndef LV_HAVE_SSE - -void qa_8sc_deinterleave_real_32f_aligned16::t1() { - printf("sse not available... no test performed\n"); -} - -#else - -void qa_8sc_deinterleave_real_32f_aligned16::t1() { - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - float output_generic[vlen] __attribute__ ((aligned (16))); - float output_sse[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_real_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - } -} - -#endif /* LV_HAVE_SSE */ - -#else - -void qa_8sc_deinterleave_real_32f_aligned16::t1() { - - - volk_runtime_init(); - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex *input0; - - float* output_generic; - float* output_sse; - float* output_sse4_1; - - ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t)); - ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float)); - ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float)); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)(((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0); - } - - printf("8sc_deinterleave_real_32f_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 1288.0, vlen, "sse"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse_time: %f\n", total); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4); - CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4); - } - - free(input0); - free(output_generic); - free(output_sse); - free(output_sse4_1); -} - -#endif /* LV_HAVE_SSE4_1 */ diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h deleted file mode 100644 index 93338e488..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H -#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H - -#include -#include - -class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc deleted file mode 100644 index a33d1bf30..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc +++ /dev/null @@ -1,61 +0,0 @@ -#include -#include -#include -#include -#include - -//test for sse - -#ifndef LV_HAVE_SSSE3 - -void qa_8sc_deinterleave_real_8s_aligned16::t1() { - printf("ssse3 not available... no test performed\n"); -} - -#else - -void qa_8sc_deinterleave_real_8s_aligned16::t1() { - - volk_environment_init(); - clock_t start, end; - double total; - const int vlen = 3201; - const int ITERS = 100000; - std::complex input0[vlen] __attribute__ ((aligned (16))); - - int8_t output_generic[vlen] __attribute__ ((aligned (16))); - int8_t output_ssse3[vlen] __attribute__ ((aligned (16))); - - int8_t* loadInput = (int8_t*)input0; - for(int i = 0; i < vlen*2; ++i) { - loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - printf("8sc_deinterleave_real_8s_aligned\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("ssse3_time: %f\n", total); - - for(int i = 0; i < 1; ++i) { - //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); - //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); - } - - for(int i = 0; i < vlen; ++i) { - //printf("%d...%d\n", output0[i], output01[i]); - CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]); - } -} - -#endif diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h deleted file mode 100644 index 92fc0dd4a..000000000 --- a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H -#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H - -#include -#include - -class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc deleted file mode 100644 index 216bf1cef..000000000 --- a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-4) - -#ifndef LV_HAVE_SSE4_1 - -void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { - printf("sse4.1 not available... no test performed\n"); -} - -#else - -void qa_8sc_multiply_conjugate_16sc_aligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 2046; - const int ITERS = 100000; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse4_1; - int i; - int8_t* inputInt8_T; - int8_t* tapsInt8_T; - - ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); - ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); - ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t)); - ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t)); - - inputInt8_T = (int8_t*)input; - tapsInt8_T = (int8_t*)taps; - for(int i = 0; i < vlen*2; ++i) { - inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - - printf("8sc_multiply_conjugate_16sc_aligned16\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex*)result_generic, (std::complex*)input, (std::complex*)taps, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex*)result_sse4_1, (std::complex*)input, (std::complex*)taps, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(i = 0; i < vlen; i++){ - //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); - - assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse4_1); - -} - -#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h deleted file mode 100644 index 0e78a5eca..000000000 --- a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H -#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H - -#include -#include - -class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc deleted file mode 100644 index 4c707446e..000000000 --- a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-4) - -#ifndef LV_HAVE_SSE4_1 - -void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { - printf("sse4.1 not available... no test performed\n"); -} - -#else - -void qa_8sc_multiply_conjugate_32fc_aligned16::t1() { - - - volk_runtime_init(); - - const int vlen = 2046; - const int ITERS = 100000; - - volk_environment_init(); - int ret; - clock_t start, end; - double total; - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result_sse4_1; - int i; - int8_t* inputInt8_T; - int8_t* tapsInt8_T; - - ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t)); - ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t)); - ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float)); - ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float)); - - - inputInt8_T = (int8_t*)input; - tapsInt8_T = (int8_t*)taps; - for(int i = 0; i < vlen*2; ++i) { - inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))) * 128.0)); - } - - printf("8sc_multiply_conjugate_32fc_aligned16\n"); - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen, "generic"); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("generic_time: %f\n", total); - - - start = clock(); - for(int count = 0; count < ITERS; ++count) { - get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex*)input, (const std::complex*)taps, 32768.0, vlen); - } - end = clock(); - total = (double)(end-start)/(double)CLOCKS_PER_SEC; - printf("sse4_1_time: %f\n", total); - - for(i = 0; i < vlen; i++){ - //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i])); - assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA); - } - - free(input); - free(taps); - free(result_generic); - free(result_sse4_1); - -} - -#endif /*LV_HAVE_SSE4_1*/ diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h deleted file mode 100644 index eb9ae309c..000000000 --- a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H -#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H - -#include -#include - -class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc deleted file mode 100644 index 8e7e59768..000000000 --- a/volk/lib/qa_volk.cc +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright 2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -/* - * This class gathers together all the test cases for the example - * directory into a single test suite. As you create new test cases, - * add them here. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -CppUnit::TestSuite * -qa_volk::suite() -{ - CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); - - s->addTest(qa_16s_quad_max_star_aligned16::suite()); - s->addTest(qa_32fc_dot_prod_aligned16::suite()); - s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); - s->addTest(qa_32fc_square_dist_aligned16::suite()); - s->addTest(qa_32f_sum_of_poly_aligned16::suite()); - s->addTest(qa_32fc_index_max_aligned16::suite()); - s->addTest(qa_32f_index_max_aligned16::suite()); - s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite()); - s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite()); - s->addTest(qa_16s_branch_4_state_8_aligned16::suite()); - s->addTest(qa_16s_max_star_horizontal_aligned16::suite()); - s->addTest(qa_16s_max_star_aligned16::suite()); - s->addTest(qa_16s_add_quad_aligned16::suite()); - s->addTest(qa_32f_add_aligned16::suite()); - s->addTest(qa_32f_subtract_aligned16::suite()); - s->addTest(qa_32f_max_aligned16::suite()); - s->addTest(qa_32f_min_aligned16::suite()); - s->addTest(qa_64f_max_aligned16::suite()); - s->addTest(qa_64f_min_aligned16::suite()); - s->addTest(qa_32s_and_aligned16::suite()); - s->addTest(qa_32s_or_aligned16::suite()); - s->addTest(qa_32f_dot_prod_aligned16::suite()); - s->addTest(qa_32f_dot_prod_unaligned16::suite()); - s->addTest(qa_32f_fm_detect_aligned16::suite()); - //s->addTest(qa_32fc_32f_multiply_aligned16::suite()); - s->addTest(qa_32fc_multiply_aligned16::suite()); - s->addTest(qa_32f_divide_aligned16::suite()); - s->addTest(qa_32f_multiply_aligned16::suite()); - s->addTest(qa_32f_sqrt_aligned16::suite()); - s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite()); - s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite()); - s->addTest(qa_32u_popcnt_aligned16::suite()); - s->addTest(qa_64u_popcnt_aligned16::suite()); - s->addTest(qa_16u_byteswap_aligned16::suite()); - s->addTest(qa_32u_byteswap_aligned16::suite()); - s->addTest(qa_64u_byteswap_aligned16::suite()); - s->addTest(qa_32f_normalize_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_16s_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite()); - s->addTest(qa_16sc_magnitude_16s_aligned16::suite()); - s->addTest(qa_16sc_magnitude_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_64f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite()); - s->addTest(qa_32fc_magnitude_16s_aligned16::suite()); - s->addTest(qa_32fc_magnitude_32f_aligned16::suite()); - s->addTest(qa_32f_interleave_16sc_aligned16::suite()); - s->addTest(qa_32f_interleave_32fc_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_16s_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite()); - s->addTest(qa_16s_convert_32f_aligned16::suite()); - s->addTest(qa_16s_convert_32f_unaligned16::suite()); - s->addTest(qa_16s_convert_8s_aligned16::suite()); - s->addTest(qa_16s_convert_8s_unaligned16::suite()); - s->addTest(qa_32f_convert_16s_aligned16::suite()); - s->addTest(qa_32f_convert_16s_unaligned16::suite()); - s->addTest(qa_32f_convert_32s_aligned16::suite()); - s->addTest(qa_32f_convert_32s_unaligned16::suite()); - s->addTest(qa_32f_convert_64f_aligned16::suite()); - s->addTest(qa_32f_convert_64f_unaligned16::suite()); - s->addTest(qa_32f_convert_8s_aligned16::suite()); - s->addTest(qa_32f_convert_8s_unaligned16::suite()); - s->addTest(qa_32s_convert_32f_aligned16::suite()); - s->addTest(qa_32s_convert_32f_unaligned16::suite()); - s->addTest(qa_64f_convert_32f_aligned16::suite()); - s->addTest(qa_64f_convert_32f_unaligned16::suite()); - s->addTest(qa_8s_convert_16s_aligned16::suite()); - s->addTest(qa_8s_convert_16s_unaligned16::suite()); - s->addTest(qa_8s_convert_32f_aligned16::suite()); - s->addTest(qa_8s_convert_32f_unaligned16::suite()); - s->addTest(qa_32fc_32f_power_32fc_aligned16::suite()); - s->addTest(qa_32f_power_aligned16::suite()); - s->addTest(qa_32fc_atan2_32f_aligned16::suite()); - s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite()); - s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite()); - s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite()); - s->addTest(qa_32f_accumulator_aligned16::suite()); - s->addTest(qa_32f_stddev_aligned16::suite()); - s->addTest(qa_32f_stddev_and_mean_aligned16::suite()); - - return s; -} diff --git a/volk/lib/qa_volk.h b/volk/lib/qa_volk.h deleted file mode 100644 index 43fa7faba..000000000 --- a/volk/lib/qa_volk.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU Example Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Example Public License for more details. - * - * You should have received a copy of the GNU Example Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -#ifndef INCLUDED_QA_VOLK_H -#define INCLUDED_QA_VOLK_H - -#include - -//! collect all the tests for the example directory - -class qa_volk { - public: - //! return suite of tests for all of example directory - static CppUnit::TestSuite *suite (); -}; - -#endif /* INCLUDED_QA_VOLK_H */ diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc deleted file mode 100644 index 50ac08eab..000000000 --- a/volk/lib/test_all.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2002,2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -int -main (int argc, char **argv) -{ - - int opt = 0; - std::string xmlOutputFile(""); - - while( (opt = getopt(argc, argv, "o:")) != -1){ - switch(opt){ - case 'o': - if(optarg){ - xmlOutputFile.assign(optarg); - } - else{ - std::cerr << "No xml file output specified for -o" << std::endl; - exit(EXIT_FAILURE); - } - break; - - default: /* '?' */ - fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n", - argv[0]); - exit(EXIT_FAILURE); - } - - } - - CppUnit::TextUi::TestRunner runner; - - runner.addTest (qa_volk::suite ()); - - bool was_successful = false; - if(!xmlOutputFile.empty()){ - std::ofstream xmlOutput(xmlOutputFile.c_str()); - if(xmlOutput.is_open()){ - runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput)); - - was_successful = runner.run("", false, true, false); - } - xmlOutput.close(); - } - else{ - was_successful = runner.run ("", false); - } - - return was_successful ? 0 : 1; -} -- cgit From f832c9789be9fec46e211be4fb2355013d19c000 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Fri, 21 Jan 2011 18:24:02 -0800 Subject: Volk: Small changes to speed things up. --- volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h | 4 ++-- volk/include/volk/volk_32f_s32f_convert_8i_a16.h | 2 +- volk/lib/qa_utils.cc | 2 +- volk/lib/testqa.cc | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h index cfbebd57b..55a25702e 100644 --- a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h +++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h @@ -53,7 +53,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, con number = sixteenthPoints * 16; int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; for(; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256)); + *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); int16ComplexVectorPtr++; } } @@ -71,7 +71,7 @@ static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, c int16_t* complexVectorPtr = (int16_t*)complexVector; int8_t* iBufferPtr = iBuffer; for(number = 0; number < num_points; number++){ - *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ / 256)); + *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8)); complexVectorPtr++; } } diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h index c91448951..f64f2a213 100644 --- a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h +++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h @@ -106,7 +106,7 @@ static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, co unsigned int number = 0; for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ * scalar)); + *outputVectorPtr++ = (int8_t)(*inputVectorPtr++ * scalar); } } #endif /* LV_HAVE_GENERIC */ diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 67ce5ddef..9cafd459f 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -32,7 +32,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { if(type.size == 8) random_floats((double *)data, n); else random_floats((float *)data, n); } else { - float int_max = pow(2, type.size*8); + float int_max = float(uint64_t(2) << (type.size*8)); if(type.is_signed) int_max /= 2.0; for(int i=0; i((RAND_MAX/2))) * int_max; diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 9f4934dc0..4cef7b443 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -40,7 +40,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_index_max_16u_a16, 0, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a16, 1, 32768, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32f_s32f_convert_16i_a16, 1, 32768, 2046, 10000); -- cgit From b0a23e876fe0f92afb2c55fd4fbce6427e9598d8 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 25 Jan 2011 15:06:23 -0800 Subject: Volk: doesn't test a routine if no valid architectures other than generic are found --- volk/lib/qa_utils.cc | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 9cafd459f..6a6f87d85 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -288,6 +288,11 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, //first let's get a list of available architectures for the test std::vector arch_list = get_arch_list(archs); + if(arch_list.size() < 2) { + std::cout << "no architectures to test" << std::endl; + return false; + } + //now we have to get a function signature by parsing the name std::vector inputsig, outputsig; get_signatures_from_name(inputsig, outputsig, name); -- cgit From e979880d446949b2d2a93087011579c383369819 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Thu, 13 Jan 2011 18:57:48 +0000 Subject: Volk: QA util has proper free(). --- volk/lib/qa_utils.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 6a6f87d85..e85e2c1bc 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -309,10 +309,12 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, //for(int i=0; i inbuffs; - + std::vector free_buffs; //this is just a list of void*'s that i'll have to free later. + //we need it because we dupe void*s in test_data below. make_buffer_for_signature(inbuffs, inputsig, vlen); for(int i=0; i arch_buffs; for(int j=0; j. +# +# This macro calls: +# +# AC_SUBST(BOOST_CPPFLAGS) / AC_SUBST(BOOST_LDFLAGS) +# +# And sets: +# +# HAVE_BOOST +# +# LAST MODIFICATION +# +# 2008-04-12 +# +# COPYLEFT +# +# Copyright (c) 2008 Thomas Porschberg +# Copyright (c) 2008 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. + +AC_DEFUN([AX_BOOST_BASE], +[ +AC_REQUIRE([GR_LIB64]) +AC_ARG_WITH([boost], + AS_HELP_STRING([--with-boost@<:@=DIR@:>@], + [use boost (default is yes) - it is possible to specify the root directory for boost (optional)]), + [ + if test "$withval" = "no"; then + want_boost="no" + elif test "$withval" = "yes"; then + want_boost="yes" + ac_boost_path="" + else + want_boost="yes" + ac_boost_path="$withval" + fi + ], + [want_boost="yes"]) + + +AC_ARG_WITH([boost-libdir], + AS_HELP_STRING([--with-boost-libdir=LIB_DIR], + [Force given directory for boost libraries. Note that this + will overwrite library path detection, so use this parameter + only if default library detection fails and you know exactly + where your boost libraries are located.]), + [ + if test -d $withval + then + ac_boost_lib_path="$withval" + else + AC_MSG_ERROR(--with-boost-libdir expected directory name) + fi + ], + [ac_boost_lib_path=""] +) + +if test "x$want_boost" = "xyes"; then + boost_lib_version_req=ifelse([$1], ,1.20.0,$1) + boost_lib_version_req_shorten=`expr $boost_lib_version_req : '\([[0-9]]*\.[[0-9]]*\)'` + boost_lib_version_req_major=`expr $boost_lib_version_req : '\([[0-9]]*\)'` + boost_lib_version_req_minor=`expr $boost_lib_version_req : '[[0-9]]*\.\([[0-9]]*\)'` + boost_lib_version_req_sub_minor=`expr $boost_lib_version_req : '[[0-9]]*\.[[0-9]]*\.\([[0-9]]*\)'` + if test "x$boost_lib_version_req_sub_minor" = "x" ; then + boost_lib_version_req_sub_minor="0" + fi + WANT_BOOST_VERSION=`expr $boost_lib_version_req_major \* 100000 \+ $boost_lib_version_req_minor \* 100 \+ $boost_lib_version_req_sub_minor` + AC_MSG_CHECKING(for boost >= $boost_lib_version_req) + succeeded=no + + dnl first we check the system location for boost libraries + dnl this location ist chosen if boost libraries are installed with the --layout=system option + dnl or if you install boost with RPM + if test "$ac_boost_path" != ""; then + dnl Look first where we think they ought to be, accounting for a possible "64" suffix on lib. + dnl If that directory doesn't exist, fall back to the default behavior + if test -d "$ac_boost_path/lib${gr_libdir_suffix}"; then + BOOST_LDFLAGS="-L$ac_boost_path/lib${gr_libdir_suffix}" + else + BOOST_LDFLAGS="-L$ac_boost_path/lib" + fi + BOOST_CPPFLAGS="-I$ac_boost_path/include" + else + for ac_boost_path_tmp in /usr /usr/local /opt /opt/local ; do + if test -d "$ac_boost_path_tmp/include/boost" && test -r "$ac_boost_path_tmp/include/boost"; then + dnl Look first where we think they ought to be, accounting for a possible "64" suffix on lib. + dnl If that directory doesn't exist, fall back to the default behavior + if test -d "$ac_boost_path_tmp/lib${gr_libdir_suffix}"; then + BOOST_LDFLAGS="-L$ac_boost_path_tmp/lib${gr_libdir_suffix}" + else + BOOST_LDFLAGS="-L$ac_boost_path_tmp/lib" + fi + BOOST_CPPFLAGS="-I$ac_boost_path_tmp/include" + break; + fi + done + fi + + dnl overwrite ld flags if we have required special directory with + dnl --with-boost-libdir parameter + if test "$ac_boost_lib_path" != ""; then + BOOST_LDFLAGS="-L$ac_boost_lib_path" + fi + + CPPFLAGS_SAVED="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + export CPPFLAGS + + LDFLAGS_SAVED="$LDFLAGS" + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + + AC_LANG_PUSH(C++) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + @%:@include + ]], [[ + #if BOOST_VERSION >= $WANT_BOOST_VERSION + // Everything is okay + #else + # error Boost version is too old + #endif + ]])],[AC_MSG_RESULT(yes) + succeeded=yes + found_system=yes + ], + []) + AC_LANG_POP([C++]) + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" + + + dnl if we found no boost with system layout we search for boost libraries + dnl built and installed without the --layout=system option + if test "$succeeded" != "yes"; then + _version=0 + + if test "$ac_boost_path" != ""; then + path_list="$ac_boost_path" + else + path_list="/usr /usr/local /opt /opt/local" + fi + for ac_boost_path in $path_list ; do + if test -d "$ac_boost_path" && test -r "$ac_boost_path"; then + for i in `ls -d $ac_boost_path/include/boost-* 2>/dev/null`; do + _version_tmp=`echo $i | sed "s#$ac_boost_path##" | sed 's,/include/boost-,,; s,_,.,'` + V_CHECK=`expr $_version_tmp \> $_version` + if test "$V_CHECK" = "1" ; then + _version=$_version_tmp + best_path=$ac_boost_path + fi + done + fi + done + + VERSION_UNDERSCORE=`echo $_version | sed 's/\./_/'` + BOOST_CPPFLAGS="-I$best_path/include/boost-$VERSION_UNDERSCORE" + + if test "$ac_boost_lib_path" = ""; then + dnl Look first where we think they ought to be, accounting for a possible "64" suffix on lib. + dnl If that directory doesn't exist, fall back to the default behavior + if test -d "$best_path/lib${gr_libdir_suffix}"; then + BOOST_LDFLAGS="-L$best_path/lib${gr_libdir_suffix}" + else + BOOST_LDFLAGS="-L$best_path/lib" + fi + fi + + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + export CPPFLAGS + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + export LDFLAGS + + AC_LANG_PUSH(C++) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + @%:@include + ]], [[ + #if BOOST_VERSION >= $WANT_BOOST_VERSION + // Everything is okay + #else + # error Boost version is too old + #endif + ]])],[AC_MSG_RESULT(yes) + succeeded=yes + found_system=yes + ], + []) + AC_LANG_POP([C++]) + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" + fi + + if test "$succeeded" != "yes" ; then + AC_MSG_RESULT([no]) + if test "$_version" = "0" ; then + AC_MSG_ERROR([[we could not detect the boost libraries (version $boost_lib_version_req_shorten or higher). +If you are sure you have boost installed, then check your version number looking in .]]) + else + AC_MSG_ERROR([your boost libraries seem to old (version $_version).]) + fi + else + AC_SUBST(BOOST_CPPFLAGS) + AC_SUBST(BOOST_LDFLAGS) + AC_DEFINE(HAVE_BOOST,1,[Define if the Boost headers are available]) + fi +fi +]) + +dnl +dnl Macros used by the boost items that need libraries. +dnl + +dnl $1 is unit name. E.g., boost_thread +AC_DEFUN([_AX_BOOST_CHECK_LIB],[ + _AX_BOOST_CHECK_LIB_($1,HAVE_[]m4_toupper($1),m4_toupper($1)_LIB) +]) + +dnl $1 is unit name. E.g., boost_thread +dnl $2 is AC_DEFINE name. E.g., HAVE_BOOST_THREAD +dnl $3 is lib var name. E.g., BOOST_THREAD_LIB +AC_DEFUN([_AX_BOOST_CHECK_LIB_],[ + AC_LANG_PUSH([C++]) + AC_DEFINE($2,1,[Define if the $1 library is available]) + BOOSTLIBDIR=`echo $BOOST_LDFLAGS | sed -e 's/@<:@^\/@:>@*//'` + + dnl See if we can find a usable library + link_ok="no" + if test "$ax_boost_user_lib" != ""; then + dnl use what the user supplied + for ax_lib in $ax_boost_user_lib $1-${ax_boost_user_lib}; do + AC_CHECK_LIB($ax_lib, exit, + [$3="-l$ax_lib"; AC_SUBST($3) link_ok="yes"; break]) + done + else + dnl Look in BOOSTLIBDIR for possible candidates + head=$BOOSTLIBDIR/lib[]$1 + for f in ${head}*.so* ${head}*.a* ${head}*.dll* ${head}*.dylib; do + dnl echo 1: $f + case $f in + *\**) continue;; + esac + f=`echo $f | sed -e 's,.*/,,' -e 's,^lib,,'` + dnl echo 2: $f + f=`echo $f | sed -e 's,\($1.*\)\.so.*$,\1,' -e 's,\($1.*\)\.a.*$,\1,' -e 's,\($1.*\)\.dll.*$,\1,' -e 's,\($1.*\)\.dylib.*$,\1,'` + dnl echo 3: $f + + ax_lib=$f + AC_CHECK_LIB($ax_lib, exit, + [$3="-l$ax_lib"; AC_SUBST($3) link_ok="yes"; break]) + done + fi + + if test "$link_ok" != "yes"; then + AC_MSG_ERROR([Could not link against lib[$1]!]) + fi + AC_LANG_POP([C++]) +]) + + +dnl $1 is unit name. E.g., boost_thread +AC_DEFUN([_AX_BOOST_WITH],[ + _AX_BOOST_WITH_($1,m4_bpatsubst($1,_,-)) +]) + +dnl $1 is unit name. E.g., boost_thread +dnl $2 is hyphenated unit name. E.g., boost-thread +AC_DEFUN([_AX_BOOST_WITH_],[ + AC_ARG_WITH([$2], + AC_HELP_STRING([--with-$2@<:@=special-lib@:>@], + [Use the m4_substr($1,6) library from boost. It is possible to specify a certain + library to the linker. E.g., --with-$2=$1-gcc41-mt-1_35]), + [ + if test "$withval" = "no"; then + want_boost="no" + elif test "$withval" = "yes"; then + want_boost="yes" + ax_boost_user_lib="" + else + want_boost="yes" + ax_boost_user_lib="$withval" + fi + ], + [want_boost="yes"]) +]) + +dnl $1 is unit name. E.g., boost_thread +dnl $2 is AC_LANG_PROGRAM argument 1 +dnl $3 is AC_LANG_PROGRAM argument 2 +dnl $4 is cv variable name. E.g., ax_cv_boost_thread +AC_DEFUN([_AX_BOOST_CHECK_],[ + _AX_BOOST_WITH($1) + if test "$want_boost" = "yes"; then + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([AC_PROG_CXX]) + CPPFLAGS_SAVED="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" + LDFLAGS_SAVED="$LDFLAGS" + LDFLAGS="$LDFLAGS $BOOST_LDFLAGS" + AC_CACHE_CHECK([whether the boost::m4_substr([$1],6) includes are available], [$4], + [AC_LANG_PUSH([C++]) + AC_COMPILE_IFELSE(AC_LANG_PROGRAM([$2],[$3]),[$4]=yes,[$4]=no) + AC_LANG_POP([C++]) + ]) + if test "$[$4]" = "yes"; then + _AX_BOOST_CHECK_LIB([$1]) + fi + CPPFLAGS="$CPPFLAGS_SAVED" + LDFLAGS="$LDFLAGS_SAVED" + fi +]) + +dnl $1 is unit name. E.g., boost_thread +dnl $2 is AC_LANG_PROGRAM argument 1 +dnl $3 is AC_LANG_PROGRAM argument 2 +AC_DEFUN([_AX_BOOST_CHECK],[ + _AX_BOOST_CHECK_($1,$2,$3,ax_cv_$1) +]) diff --git a/volk/config/ax_boost_unit_test_framework.m4 b/volk/config/ax_boost_unit_test_framework.m4 new file mode 100644 index 000000000..73affccfd --- /dev/null +++ b/volk/config/ax_boost_unit_test_framework.m4 @@ -0,0 +1,36 @@ +# +# SYNOPSIS +# +# AX_BOOST_UNIT_TEST_FRAMEWORK +# +# DESCRIPTION +# +# Test for Unit_Test_Framework library from the Boost C++ libraries. The +# macro requires a preceding call to AX_BOOST_BASE. +# +# This macro calls: +# +# AC_SUBST(BOOST_UNIT_TEST_FRAMEWORK_LIB) +# +# And sets: +# +# HAVE_BOOST_UNIT_TEST_FRAMEWORK +# +# COPYLEFT +# +# Copyright (c) 2008 Thomas Porschberg +# Copyright (c) 2008 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. + +AC_DEFUN([AX_BOOST_UNIT_TEST_FRAMEWORK], +[ + AC_REQUIRE([AX_BOOST_BASE]) + _AX_BOOST_CHECK([boost_unit_test_framework], + [@%:@include ], + [using boost::unit_test::test_suite; + test_suite* test= BOOST_TEST_SUITE( "Unit test example 1" ); + return 0;]) +]) diff --git a/volk/configure.ac b/volk/configure.ac index 44a4c4258..eeb7441ba 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -43,7 +43,7 @@ dnl If you need additional boost libraries, you'll need to dnl uncomment AX_BOOST_BASE, plus some of the following: dnl dnl calls AC_SUBST(BOOST_CPPFLAGS), AC_SUBST(BOOST_LDFLAGS) and defines HAVE_BOOST -dnl AX_BOOST_BASE([1.35]) +AX_BOOST_BASE([1.35]) dnl dnl All the rest of these call AC_SUBST(BOOST__LIB) and define HAVE_BOOST_ dnl @@ -56,7 +56,7 @@ dnl AX_BOOST_SERIALIZATION dnl AX_BOOST_SIGNALS dnl AX_BOOST_SYSTEM dnl AX_BOOST_TEST_EXEC_MONITOR -dnl AX_BOOST_UNIT_TEST_FRAMEWORK +AX_BOOST_UNIT_TEST_FRAMEWORK dnl AX_BOOST_WSERIALIZATION AC_CONFIG_HEADERS([volk_config.h]) diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index bbc993fa2..afd29a352 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -121,7 +121,7 @@ noinst_PROGRAMS = \ testqa_SOURCES = testqa.cc qa_utils.cc testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN -testqa_LDFLAGS = -lboost_unit_test_framework +testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) if LV_HAVE_ORC testqa_LDADD = \ libvolk.la \ -- cgit From 3024e95eeae2891b1cc575527bd09214a9493b3f Mon Sep 17 00:00:00 2001 From: root Date: Thu, 13 Jan 2011 20:42:32 +0000 Subject: Volk: fixed placeholder Orc impl for 32fc_x2_multiply_32fc --- volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc | 1 + 1 file changed, 1 insertion(+) (limited to 'volk') diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc index d23892880..b72dfe8e7 100644 --- a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc +++ b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc @@ -1,6 +1,7 @@ .function volk_32fc_x2_multiply_32fc_a16_orc_impl .source 8 src1 .source 8 src2 +.floatparam 4 mask .dest 8 dst .temp 8 tmp x2 mulf dst, src1, src2 -- cgit From 81c3086bee1752c94a89ab2d20b7de048fdd1be7 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 19:58:11 -0500 Subject: Cleans up the Makefiles for the various platforms. This should also make it easier to add new architectures. Thanks to Josh for the inspiration. --- volk/Makefile.am | 2 +- volk/Makefile.common | 30 +++++++++++++++++- volk/configure.ac | 9 +----- volk/include/volk/Makefile.am | 26 ++------------- volk/lib/Makefile.am | 74 ++++--------------------------------------- 5 files changed, 40 insertions(+), 101 deletions(-) (limited to 'volk') diff --git a/volk/Makefile.am b/volk/Makefile.am index 3521dd0e4..4c6951ca7 100644 --- a/volk/Makefile.am +++ b/volk/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2004,2008 Free Software Foundation, Inc. +# Copyright 2004,2008,2010 Free Software Foundation, Inc. # # This file is part of GNU Radio # diff --git a/volk/Makefile.common b/volk/Makefile.common index 083f6f710..daa8d78b6 100644 --- a/volk/Makefile.common +++ b/volk/Makefile.common @@ -1,6 +1,6 @@ # -*- Makefile -*- # -# Copyright 2010 Free Software Foundation, Inc. +# Copyright 2010,2011 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -20,6 +20,33 @@ # Boston, MA 02110-1301, USA. # +if MD_CPU_generic + platform_CODE = \ + $(top_srcdir)/lib/volk_cpu_generic.c +endif + +if MD_CPU_x86 +if MD_SUBCPU_x86_64 + platform_CODE = \ + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86_64.S +endif +endif + +if MD_CPU_x86 +if !MD_SUBCPU_x86_64 + platform_CODE = \ + $(top_srcdir)/lib/volk_cpu_x86.c \ + $(top_srcdir)/lib/cpuid_x86.S +endif +endif + +if MD_CPU_powerpc + platform_CODE = \ + $(top_srcdir)/lib/volk_cpu_powerpc.c +endif + + ourincludedir = $(includedir)/volk # swig includes @@ -44,3 +71,4 @@ STD_DEFINES_AND_INCLUDES=-I$(top_srcdir)/include -I$(top_srcdir)/lib $(GNURADIO # not. We define it now in configure.ac using AM_PATH_PROG, but now # here have to add a -f to be like GNU make. RM=$(RM_PROG) -f + diff --git a/volk/configure.ac b/volk/configure.ac index 8f17e5065..7cbcbad53 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -1,5 +1,5 @@ dnl -dnl Copyright 2010 Free Software Foundation, Inc. +dnl Copyright 2010,2011 Free Software Foundation, Inc. dnl dnl This program is free software: you can redistribute it and/or modify dnl it under the terms of the GNU General Public License as published by @@ -63,13 +63,6 @@ dnl AX_BOOST_WSERIALIZATION AC_CONFIG_HEADERS([volk_config.h]) LV_SET_SIMD_FLAGS -# FIXME: Not very extensible to supporting more processors easily -AM_CONDITIONAL([MYCPU_X86], [test "$MD_CPU" = "x86"]) -AM_CONDITIONAL([MYSUBCPU_X86], [test "$MD_SUBCPU" = "x86"]) -AM_CONDITIONAL([MYSUBCPU_X86_64], [test "$MD_SUBCPU" = "x86_64"]) -AM_CONDITIONAL([MYSUBCPU_POWERPC], [test "$MD_SUBCPU" = "powerpc"]) -AM_CONDITIONAL([MYSUBCPU_GENERIC], [test "$MD_SUBCPU" != "powerpc" && test "$MD_SUBCPU" != "x86" && test "$MD_SUBCPU" != "x86_64"]) - AC_CONFIG_FILES([\ Makefile \ config/Makefile \ diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 04a43bd34..658974d3a 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2010 Free Software Foundation, Inc. +# Copyright 2010,2011 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -130,33 +130,11 @@ volkinclude_HEADERS = \ volk_8s_convert_32f_unaligned16.h VOLK_MKTABLES_SOURCES = \ + $(platform_CODE) \ $(top_srcdir)/lib/volk_rank_archs.c \ $(top_srcdir)/lib/volk_mktables.c -# FIXME: Not very extensible to supporting more processors easily -if MYSUBCPU_X86_64 - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86_64.S -endif - -if MYSUBCPU_X86 - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86.S -endif - -if MYSUBCPU_POWERPC - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_powerpc.c -endif - -if MYSUBCPU_GENERIC - VOLK_MKTABLES_SOURCES += \ - $(top_srcdir)/lib/volk_cpu_generic.c -endif - volk_mktables$(EXEEXT): $(VOLK_MKTABLES_SOURCES) $(CC) -o $@ $^ $(AM_CPPFLAGS) -I$(top_builddir)/include diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 814d438fd..896d568e6 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright 2010 Free Software Foundation, Inc. +# Copyright 2010,2011 Free Software Foundation, Inc. # # This file is part of GNU Radio # @@ -61,77 +61,17 @@ EXTRA_DIST = \ # The main library # ---------------------------------------------------------------- -universal_runtime_CODE = \ - volk_runtime.c \ - volk_init.c \ +libvolk_runtime_la_SOURCES = \ + $(platform_CODE) \ + volk_runtime.c \ + volk_init.c \ volk_rank_archs.c -universal_CODE = \ +libvolk_la_SOURCES = \ + $(platform_CODE) \ volk.c \ volk_environment_init.c -generic_CODE = \ - volk_cpu_generic.c - -x86_CODE = \ - volk_cpu_x86.c - -x86_SUBCODE = \ - cpuid_x86.S - -x86_64_SUBCODE = \ - cpuid_x86_64.S - -powerpc_CODE = \ - volk_cpu_powerpc.c - - -if MD_CPU_generic -libvolk_la_SOURCES = \ - $(generic_CODE) \ - $(universal_CODE) -libvolk_runtime_la_SOURCES = \ - $(generic_CODE) \ - $(universal_runtime_CODE) - -endif - -if MD_CPU_x86 -if MD_SUBCPU_x86_64 -libvolk_la_SOURCES = \ - $(x86_CODE) \ - $(x86_64_SUBCODE) \ - $(universal_CODE) - -libvolk_runtime_la_SOURCES = \ - $(x86_CODE) \ - $(x86_64_SUBCODE) \ - $(universal_runtime_CODE) -else -libvolk_la_SOURCES = \ - $(x86_CODE) \ - $(x86_SUBCODE) \ - $(universal_CODE) - -libvolk_runtime_la_SOURCES = \ - $(x86_CODE) \ - $(x86_SUBCODE) \ - $(universal_runtime_CODE) -endif -endif - - -if MD_CPU_powerpc -libvolk_la_SOURCES = \ - $(powerpc_CODE) \ - $(universal_CODE) - -libvolk_runtime_la_SOURCES = \ - $(powerpc_CODE) \ - $(universal_runtime_CODE) -endif - - libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 -- cgit From 74e9f2b0bce026b7b68a2d0a3558687386fdaab5 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 23:18:07 -0500 Subject: Creates volk_cpu in a .cc file and externs it in the header. --- volk/include/volk/make_cpuid_h.py | 25 ++++++++++++++++++++++++- volk/include/volk/make_proccpu_sim.py | 23 +++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py index 823e3b2c0..cd3da2455 100644 --- a/volk/include/volk/make_cpuid_h.py +++ b/volk/include/volk/make_cpuid_h.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom from emit_omnilog import * @@ -13,7 +35,8 @@ def make_cpuid_h(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "}volk_cpu;\n\n"; + tempstring = tempstring + "};\n\n"; + tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; tempstring = tempstring + "void volk_cpu_init ();\n" tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py index c75a4d5fb..e8463f5be 100644 --- a/volk/include/volk/make_proccpu_sim.py +++ b/volk/include/volk/make_proccpu_sim.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_proccpu_sim(dom) : @@ -6,6 +28,7 @@ def make_proccpu_sim(dom) : tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n" tempstring = tempstring + "\n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; tempstring = tempstring + " if(val==1){\n"; -- cgit From 568d0cafb8423f34530c94963654abea4c62801b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Fri, 14 Jan 2011 23:18:07 -0500 Subject: A fix for volk_cpu being a duplicate variable. Declared as extern in the header file volk_cpu.h and actually created in the C files. Note that this could be a problem if multiple architectures are ever (can ever?) be built at the same time. If that happens, we can move this variable declaration to another C file that is made common to all builds. --- volk/include/volk/make_cpuid_generic_c.py | 26 ++++++++++++++++++++++++-- volk/include/volk/make_cpuid_h.py | 25 ++++++++++++++++++++++++- volk/include/volk/make_cpuid_powerpc_c.py | 26 ++++++++++++++++++++++++-- volk/include/volk/make_cpuid_x86_c.py | 27 +++++++++++++++++++++++++-- 4 files changed, 97 insertions(+), 7 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/make_cpuid_generic_c.py b/volk/include/volk/make_cpuid_generic_c.py index 3ba225fca..c682d4138 100644 --- a/volk/include/volk/make_cpuid_generic_c.py +++ b/volk/include/volk/make_cpuid_generic_c.py @@ -1,11 +1,33 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_generic_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" for domarch in dom: if str(domarch.attributes["type"].value) == "all": diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py index 823e3b2c0..cd3da2455 100644 --- a/volk/include/volk/make_cpuid_h.py +++ b/volk/include/volk/make_cpuid_h.py @@ -1,3 +1,25 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom from emit_omnilog import * @@ -13,7 +35,8 @@ def make_cpuid_h(dom) : for domarch in dom: arch = str(domarch.attributes["name"].value); tempstring = tempstring + " int (*has_" + arch + ") ();\n"; - tempstring = tempstring + "}volk_cpu;\n\n"; + tempstring = tempstring + "};\n\n"; + tempstring = tempstring + "extern struct VOLK_CPU volk_cpu;\n\n"; tempstring = tempstring + "void volk_cpu_init ();\n" tempstring = tempstring + "unsigned int volk_get_lvarch ();\n" diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py index 443a58488..0b0ea84e7 100644 --- a/volk/include/volk/make_cpuid_powerpc_c.py +++ b/volk/include/volk/make_cpuid_powerpc_c.py @@ -1,11 +1,33 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_powerpc_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" #just assume it has them for powerpc for domarch in dom: diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index 8ebe243e5..48a406fa4 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -1,11 +1,34 @@ +#!/usr/bin/env python +# +# Copyright 2011 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + from xml.dom import minidom def make_cpuid_x86_c(dom) : tempstring = ""; tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" - tempstring = tempstring + "#include \n" - tempstring = tempstring + "\n\n" + tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" + tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; tempstring = tempstring + " unsigned int regs[4];\n" -- cgit From 2e8610450bc467400f8de2976cc6c1743025aff1 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 15 Jan 2011 10:53:54 -0500 Subject: This should not have been put here. --- volk/include/volk/make_proccpu_sim.py | 1 - 1 file changed, 1 deletion(-) (limited to 'volk') diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py index e8463f5be..029dacfcc 100644 --- a/volk/include/volk/make_proccpu_sim.py +++ b/volk/include/volk/make_proccpu_sim.py @@ -28,7 +28,6 @@ def make_proccpu_sim(dom) : tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n" tempstring = tempstring + "\n\n" - tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" tempstring = tempstring + "void test_append(char* buf, int val, char* newkey){\n"; tempstring = tempstring + " if(val==1){\n"; -- cgit From 48f9ca90e0cbcbfe67b0e10889c60928d9be5c49 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 17 Jan 2011 16:37:45 -0800 Subject: gr math simplification: Replaces gr_gcd, gr_isnan, and gr_signbit one-time instances with boot math calls. No point in wrapping these utility math functions into gnuradio when they are 1) provided by boost 2) only called once Removes gr_math.cc, and configure checks for isnan. --- volk/config/lf_cxx.m4 | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'volk') diff --git a/volk/config/lf_cxx.m4 b/volk/config/lf_cxx.m4 index dfc6bfbfe..7cce5f8a4 100644 --- a/volk/config/lf_cxx.m4 +++ b/volk/config/lf_cxx.m4 @@ -46,22 +46,5 @@ AC_DEFUN([LF_CXX_PORTABILITY],[ dnl Check for common C++ portability problems dnl - dnl AC_LANG_PUSH - dnl AC_LANG_CPLUSPLUS - AC_LANG_SAVE - AC_LANG_CPLUSPLUS - - - dnl Test whether C++ has std::isnan - AC_MSG_CHECKING(whether C++ has std::isnan) - AC_TRY_COMPILE([#include ], [ - std::isnan(0); -], [ AC_MSG_RESULT(yes) - AC_DEFINE(CXX_HAS_STD_ISNAN,[],[Define if has std::isnan]) ], - [ AC_MSG_RESULT(no) ]) - - dnl Done with the portability checks - dnl AC_LANG_POP([C++]) - AC_LANG_RESTORE ]) -- cgit From 9edf280fb25bee071c166123ac1aee41d4a4949e Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sat, 15 Jan 2011 17:29:15 -0800 Subject: volk: replace assembly and separate cases with gcc cpuid for all x86 --- volk/Makefile.common | 13 +-- volk/include/volk/make_cpuid_x86_c.py | 3 +- volk/include/volk/make_set_simd.py | 33 ++----- volk/lib/Makefile.am | 2 +- volk/lib/assembly.h | 67 ------------- volk/lib/cpuid_x86.S | 60 ------------ volk/lib/cpuid_x86_64.S | 54 ----------- volk/lib/gcc_x86_cpuid.h | 178 ++++++++++++++++++++++++++++++++++ 8 files changed, 191 insertions(+), 219 deletions(-) delete mode 100644 volk/lib/assembly.h delete mode 100644 volk/lib/cpuid_x86.S delete mode 100644 volk/lib/cpuid_x86_64.S create mode 100644 volk/lib/gcc_x86_cpuid.h (limited to 'volk') diff --git a/volk/Makefile.common b/volk/Makefile.common index daa8d78b6..e91a55e9c 100644 --- a/volk/Makefile.common +++ b/volk/Makefile.common @@ -26,19 +26,8 @@ if MD_CPU_generic endif if MD_CPU_x86 -if MD_SUBCPU_x86_64 platform_CODE = \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86_64.S -endif -endif - -if MD_CPU_x86 -if !MD_SUBCPU_x86_64 - platform_CODE = \ - $(top_srcdir)/lib/volk_cpu_x86.c \ - $(top_srcdir)/lib/cpuid_x86.S -endif + $(top_srcdir)/lib/volk_cpu_x86.c endif if MD_CPU_powerpc diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py index 48a406fa4..2b2bd7c91 100644 --- a/volk/include/volk/make_cpuid_x86_c.py +++ b/volk/include/volk/make_cpuid_x86_c.py @@ -27,9 +27,10 @@ def make_cpuid_x86_c(dom) : tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n"; tempstring = tempstring + "#include \n" tempstring = tempstring + "#include \n\n" + tempstring = tempstring + "#include \n\n" tempstring = tempstring + "struct VOLK_CPU volk_cpu;\n\n" - tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n" + tempstring = tempstring + "#define cpuid_x86(op, r) __get_cpuid(op, r+0, r+1, r+2, r+3)\n\n" tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n"; tempstring = tempstring + " unsigned int regs[4];\n" tempstring = tempstring + " cpuid_x86 (op, regs);\n" diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index 842366b18..275d3869f 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -27,30 +27,15 @@ def make_set_simd(dom) : tempstring = tempstring + " AC_MSG_CHECKING([proccpu])\n"; tempstring = tempstring + " case \"$MD_CPU\" in\n"; tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " case \"$MD_SUBCPU\" in\n"; - tempstring = tempstring + " (x86)\n"; - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I $srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86.S`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " (*)\n" - tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c $srcdir/lib/cpuid_x86_64.S`\"\n"; - tempstring = tempstring + " then\n"; - tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; - tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; - tempstring = tempstring + " rm -f proccpu\n"; - tempstring = tempstring + " else\n"; - tempstring = tempstring + " AC_MSG_RESULT(no)\n"; - tempstring = tempstring + " lv_PROCCPU=no\n"; - tempstring = tempstring + " fi\n" - tempstring = tempstring + " ;;\n" - tempstring = tempstring + " esac\n" + tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ -I$srcdir/lib $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_x86.c 2>&1`\"\n"; + tempstring = tempstring + " then\n"; + tempstring = tempstring + " AC_MSG_RESULT(yes)\n"; + tempstring = tempstring + " lv_PROCCPU=\"`./proccpu`\"\n"; + tempstring = tempstring + " rm -f proccpu\n"; + tempstring = tempstring + " else\n"; + tempstring = tempstring + " AC_MSG_RESULT(no)\n"; + tempstring = tempstring + " lv_PROCCPU=no\n"; + tempstring = tempstring + " fi\n" tempstring = tempstring + " ;;\n"; tempstring = tempstring + " (powerpc)\n"; tempstring = tempstring + " if test -z \"`${CC} -o proccpu -I$srcdir/include/ $srcdir/lib/volk_proccpu_sim.c $srcdir/lib/volk_cpu_powerpc.c 2>&1`\"\n"; diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 896d568e6..7a355e86a 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -186,7 +186,7 @@ libvolk_qa_la_LIBADD = \ noinst_HEADERS = \ volk_init.h \ qa_volk.h \ - assembly.h \ + gcc_x86_cpuid.h \ qa_16s_quad_max_star_aligned16.h \ qa_32fc_dot_prod_aligned16.h \ qa_32fc_square_dist_aligned16.h \ diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h deleted file mode 100644 index 8a99aa07c..000000000 --- a/volk/lib/assembly.h +++ /dev/null @@ -1,67 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2002 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -#ifndef _ASSEMBLY_H_ -#define _ASSEMBLY_H_ - -#if defined (__APPLE__) && defined (__APPLE_CC__) - -// XCode ignores the .scl and .type functions in XCode 2.2.1 and 2.3, -// but creates an error in XCode 2.4. Just ignore them. - -#define GLOB_SYMB(f) _ ## f - -#define DEF_FUNC_HEAD(f) /* none */ - -#define FUNC_TAIL(f) /* none*/ - -#elif !defined (__ELF__) - -/* - * Too bad, the following define does not work as expected --SF - * #define GLOB_SYMB(f) __USER_LABEL_PREFIX__ ## f - */ -#define GLOB_SYMB(f) _ ## f - -#define DEF_FUNC_HEAD(f) \ - .def GLOB_SYMB(f); .scl 2; .type 32; .endef - -#define FUNC_TAIL(f) /* none */ - - -#else /* !__ELF__ */ - - -#define GLOB_SYMB(f) f - -#define DEF_FUNC_HEAD(f) \ - .type GLOB_SYMB(f),@function \ - -#define FUNC_TAIL(f) \ - .Lfe1: \ - .size GLOB_SYMB(f),.Lfe1-GLOB_SYMB(f) - - -#endif /* !__ELF__ */ - - -#endif /* _ASSEMBLY_H_ */ diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S deleted file mode 100644 index 4e1a9404f..000000000 --- a/volk/lib/cpuid_x86.S +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright 2003 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -# -# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result -# -# void cpuid_x86 (unsigned int op, unsigned int result[4]); -# - -#include "assembly.h" - -.file "cpuid_x86.S" - .version "01.01" -.text -.globl GLOB_SYMB(cpuid_x86) - DEF_FUNC_HEAD(cpuid_x86) -GLOB_SYMB(cpuid_x86): - pushl %ebp - movl %esp, %ebp - pushl %ebx # must save in PIC mode, holds GOT pointer - pushl %esi - - movl 8(%ebp), %eax # op - movl 12(%ebp), %esi # result - cpuid - movl %eax, 0(%esi) - movl %ebx, 4(%esi) - movl %ecx, 8(%esi) - movl %edx, 12(%esi) - - popl %esi - popl %ebx - popl %ebp - ret - -FUNC_TAIL(cpuid_x86) - .ident "Hand coded cpuid assembly" - - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S deleted file mode 100644 index 32b1847cd..000000000 --- a/volk/lib/cpuid_x86_64.S +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright 2003,2005 Free Software Foundation, Inc. -# -# This file is part of GNU Radio -# -# GNU Radio is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# GNU Radio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with GNU Radio; see the file COPYING. If not, write to -# the Free Software Foundation, Inc., 51 Franklin Street, -# Boston, MA 02110-1301, USA. -# - -# -# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result -# -# void cpuid_x86 (unsigned int op, unsigned int result[4]); -# - -#include "assembly.h" - -.file "cpuid_x86_64.S" - .version "01.01" -.text -.globl GLOB_SYMB(cpuid_x86) - DEF_FUNC_HEAD(cpuid_x86) -GLOB_SYMB(cpuid_x86): - mov %rbx, %r11 # must save in PIC mode, holds GOT pointer - - mov %rdi, %rax # op - cpuid - movl %eax, 0(%rsi) # result - movl %ebx, 4(%rsi) - movl %ecx, 8(%rsi) - movl %edx, 12(%rsi) - - mov %r11, %rbx - retq - -FUNC_TAIL(cpuid_x86) - .ident "Hand coded cpuid64 assembly" - - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/volk/lib/gcc_x86_cpuid.h b/volk/lib/gcc_x86_cpuid.h new file mode 100644 index 000000000..2d0916fb3 --- /dev/null +++ b/volk/lib/gcc_x86_cpuid.h @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * . + */ + +/* %ecx */ +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) +#define bit_CMPXCHG16B (1 << 13) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) + +/* %edx */ +#define bit_CMPXCHG8B (1 << 8) +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) + +/* Extended Features */ +/* %ecx */ +#define bit_LAHF_LM (1 << 0) +#define bit_SSE4a (1 << 6) +#define bit_SSE5 (1 << 11) + +/* %edx */ +#define bit_LM (1 << 29) +#define bit_3DNOWP (1 << 30) +#define bit_3DNOW (1 << 31) + + +#if defined(__i386__) && defined(__PIC__) +/* %ebx may be the PIC register. */ +#if __GNUC__ >= 3 +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ +#define __cpuid(level, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("xchgl\t%%ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchgl\t%%ebx, %1\n\t" \ + : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif +#else +#define __cpuid(level, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level)) + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__ ("cpuid\n\t" \ + : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ + : "0" (level), "2" (count)) +#endif + +/* Return highest supported input value for cpuid instruction. ext can + be either 0x0 or 0x8000000 to return highest supported value for + basic or extended cpuid information. Function returns 0 if cpuid + is not supported or whatever cpuid returns in eax register. If sig + pointer is non-null, then first four bytes of the signature + (as found in ebx register) are returned in location pointed by sig. */ + +static __inline unsigned int +__get_cpuid_max (unsigned int __ext, unsigned int *__sig) +{ + unsigned int __eax, __ebx, __ecx, __edx; + +#ifndef __x86_64__ +#if __GNUC__ >= 3 + /* See if we can use cpuid. On AMD64 we always can. */ + __asm__ ("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#else +/* Host GCCs older than 3.0 weren't supporting Intel asm syntax + nor alternatives in i386 code. */ + __asm__ ("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r" (__eax), "=&r" (__ebx) + : "i" (0x00200000)); +#endif + + if (!((__eax ^ __ebx) & 0x00200000)) + return 0; +#endif + + /* Host supports cpuid. Return highest supported cpuid input value. */ + __cpuid (__ext, __eax, __ebx, __ecx, __edx); + + if (__sig) + *__sig = __ebx; + + return __eax; +} + +/* Return cpuid data for requested cpuid level, as found in returned + eax, ebx, ecx and edx registers. The function checks if cpuid is + supported and returns 1 for valid cpuid information or 0 for + unsupported cpuid level. All pointers are required to be non-null. */ + +static __inline int +__get_cpuid (unsigned int __level, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, unsigned int *__edx) +{ + unsigned int __ext = __level & 0x80000000; + + if (__get_cpuid_max (__ext, 0) < __level) + return 0; + + __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} -- cgit From 05cc02cec03507c47846a668c92e6dcc4ba2e71e Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 18 Jan 2011 01:00:15 -0800 Subject: cleanup mkdir usage with boost filesystem: Replaced copy/pasted code and MKDIR_TAKES_ONE_ARG #ifdefs with portable boost filesystem path and directory creation. Gets the correct home directory on windows systems: APPDATA. Replaces large amounts of copypasta with single lines of code. Removes MKDIR_TAKES_ONE_ARG configuration checks from m4 files. Adds boost filesystem and system library as build dependencies. --- volk/config/gr_pwin32.m4 | 11 ----------- volk/config/mkstemp.m4 | 11 ----------- 2 files changed, 22 deletions(-) (limited to 'volk') diff --git a/volk/config/gr_pwin32.m4 b/volk/config/gr_pwin32.m4 index 7b99cba6b..c75535b8e 100644 --- a/volk/config/gr_pwin32.m4 +++ b/volk/config/gr_pwin32.m4 @@ -61,17 +61,6 @@ AC_TRY_LINK([ #include AC_MSG_RESULT(no) ) -dnl Under Win32, mkdir prototype in io.h has only one arg -AC_MSG_CHECKING(whether mkdir accepts only one arg) -AC_TRY_COMPILE([#include - #include - #include ], [ - mkdir("") - ], [ AC_MSG_RESULT(yes) - AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ], - [ AC_MSG_RESULT(no) - ]) - AH_BOTTOM( [ /* Define missing prototypes, implemented in replacement lib */ diff --git a/volk/config/mkstemp.m4 b/volk/config/mkstemp.m4 index 4af0f0a9b..2d1fbee9b 100644 --- a/volk/config/mkstemp.m4 +++ b/volk/config/mkstemp.m4 @@ -75,15 +75,4 @@ AC_DEFUN([jm_PREREQ_TEMPNAME], AC_CHECK_FUNCS(__secure_getenv gettimeofday lstat) AC_CHECK_DECLS_ONCE(getenv) # AC_REQUIRE([jm_AC_TYPE_UINTMAX_T]) - - dnl Under Win32, mkdir prototype in io.h has only one arg - AC_MSG_CHECKING(whether mkdir accepts only one arg) - AC_TRY_COMPILE([#include - #include - #include ], [ - mkdir("") - ], [ AC_MSG_RESULT(yes) - AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ], - [ AC_MSG_RESULT(no) - ]) ]) -- cgit From ebb0f56da62e9ff16928b32cc525f24c93a99e0b Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sat, 22 Jan 2011 14:54:20 -0500 Subject: Updating copyright. --- volk/config/gr_pwin32.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/config/gr_pwin32.m4 b/volk/config/gr_pwin32.m4 index c75535b8e..b55bc64b1 100644 --- a/volk/config/gr_pwin32.m4 +++ b/volk/config/gr_pwin32.m4 @@ -1,6 +1,6 @@ # Check for (mingw)win32 POSIX replacements. -*- Autoconf -*- -# Copyright 2003,2004,2005 Free Software Foundation, Inc. +# Copyright 2003,2004,2005,2011 Free Software Foundation, Inc. # # This file is part of GNU Radio # -- cgit From 108a594c0838ad21f93cba6597d1f66af097b157 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 25 Jan 2011 10:37:49 -0500 Subject: volk: New volk kernel for conjugate dot products with unaligned buffers. Note: need to convert this to new naming standard. --- volk/include/volk/Makefile.am | 3 +- .../volk/volk_32fc_conjugate_dot_prod_unaligned.h | 144 +++++++++++++++++++++ volk/lib/Makefile.am | 2 + volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc | 138 ++++++++++++++++++++ volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h | 18 +++ volk/lib/qa_volk.cc | 2 + 6 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc create mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 658974d3a..00289be1e 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -66,6 +66,7 @@ volkinclude_HEADERS = \ volk_32f_calc_spectral_noise_floor_aligned16.h \ volk_32fc_atan2_32f_aligned16.h \ volk_32fc_conjugate_dot_prod_aligned16.h \ + volk_32fc_conjugate_dot_prod_unaligned.h \ volk_32fc_deinterleave_32f_aligned16.h \ volk_32fc_deinterleave_64f_aligned16.h \ volk_32fc_deinterleave_real_16s_aligned16.h \ @@ -157,4 +158,4 @@ distclean-local: rm -f Makefile.in rm -f volk_environment_init.h rm -f volk_mktables - rm -f $(BUILT_SOURCES) \ No newline at end of file + rm -f $(BUILT_SOURCES) diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h new file mode 100644 index 000000000..ead1573fe --- /dev/null +++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H +#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H + + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_conjugate_dot_prod_unaligned_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE3 + +#include +#include +#include + + +static inline void volk_32fc_conjugate_dot_prod_unaligned_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = in1*in2; + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = in1*fehg; + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... + if(num_bytes > 0){ + + if(num_bytes != 4){ + // bad things are happening + } + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_and_ps(in1*in2, halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_32fc_init(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H*/ + + + diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 7a355e86a..beb815e63 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -93,6 +93,7 @@ libvolk_qa_la_SOURCES = \ qa_32fc_index_max_aligned16.cc \ qa_32f_index_max_aligned16.cc \ qa_32fc_conjugate_dot_prod_aligned16.cc \ + qa_32fc_conjugate_dot_prod_unaligned.cc \ qa_16s_permute_and_scalar_add_aligned16.cc \ qa_16s_branch_4_state_8_aligned16.cc \ qa_16s_max_star_horizontal_aligned16.cc \ @@ -195,6 +196,7 @@ noinst_HEADERS = \ qa_32fc_index_max_aligned16.h \ qa_32f_index_max_aligned16.h \ qa_32fc_conjugate_dot_prod_aligned16.h \ + qa_32fc_conjugate_dot_prod_unaligned.h \ qa_16s_permute_and_scalar_add_aligned16.h \ qa_16s_branch_4_state_8_aligned16.h \ qa_16s_max_star_horizontal_aligned16.h \ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc new file mode 100644 index 000000000..a0680bab6 --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_unaligned::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_unaligned_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_unaligned_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_conjugate_dot_prod_unaligned\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_conjugate_dot_prod_unaligned::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_conjugate_dot_prod_unaligned_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_conjugate_dot_prod_unaligned_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_conjugate_dot_prod_unaligned\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_conjugate_dot_prod_unaligned::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h new file mode 100644 index 000000000..7aead53a1 --- /dev/null +++ b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H +#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H + +#include +#include + +class qa_32fc_conjugate_dot_prod_unaligned : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_unaligned); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc index c3c27b69b..98d3e9728 100644 --- a/volk/lib/qa_volk.cc +++ b/volk/lib/qa_volk.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,7 @@ qa_volk::suite() s->addTest(qa_32fc_index_max_aligned16::suite()); s->addTest(qa_32f_index_max_aligned16::suite()); s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite()); + s->addTest(qa_32fc_conjugate_dot_prod_unaligned::suite()); s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite()); s->addTest(qa_16s_branch_4_state_8_aligned16::suite()); s->addTest(qa_16s_max_star_horizontal_aligned16::suite()); -- cgit From f47466012c733526e04f8ceeb66a3677eec38cd9 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Tue, 25 Jan 2011 18:20:36 -0800 Subject: Volk: Orc impl for complex multiply fixed. Maybe some room for optimization. --- volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h | 5 ++--- volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h index 224ab19c8..b4214f5d2 100644 --- a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h +++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h @@ -81,10 +81,9 @@ static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, co \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points); +extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - static const float mask = -0.0; - volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points); + volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc index b72dfe8e7..a27d722cd 100644 --- a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc +++ b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc @@ -1,7 +1,18 @@ .function volk_32fc_x2_multiply_32fc_a16_orc_impl .source 8 src1 .source 8 src2 -.floatparam 4 mask .dest 8 dst -.temp 8 tmp -x2 mulf dst, src1, src2 +.temp 8 iqprod +.temp 4 real +.temp 4 imag +.temp 4 ac +.temp 4 bd +.temp 8 swapped +x2 mulf iqprod, src1, src2 +splitql bd, ac, iqprod +subf real, ac, bd +swaplq swapped, src1 +x2 mulf iqprod, swapped, src2 +splitql bd, ac, iqprod +addf imag, ac, bd +mergelq dst, real, imag -- cgit From 023167ca8a85ab597f9e59302733f71809a8afbd Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Tue, 25 Jan 2011 21:36:01 -0500 Subject: volk: Adding explicit links to local volk libraries. Required to prevent breakage when adding new volk kernels. --- volk/lib/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index beb815e63..446ff574f 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -285,7 +285,7 @@ noinst_PROGRAMS = \ test_all test_all_SOURCES = test_all.cc -test_all_LDADD = libvolk_qa.la +test_all_LDADD = libvolk.la libvolk_runtime.la libvolk_qa.la distclean-local: -- cgit From 2a4c4f89187bf75caa34c7bc52fc32310a75c9f2 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 15:28:35 -0800 Subject: Volk: fixed volk_8i_s32f_convert_32f_a16_orc_impl. --- volk/include/volk/volk_8i_s32f_convert_32f_a16.h | 3 ++- volk/lib/testqa.cc | 6 +++--- volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'volk') diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h index d5c8eeb51..99a24ec10 100644 --- a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h +++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h @@ -96,7 +96,8 @@ static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, con */ extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points); static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){ - volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points); + float invscalar = 1.0 / scalar; + volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, invscalar, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index 4cef7b443..d6b9e347d 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -49,8 +49,8 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { VOLK_RUN_TESTS(volk_32f_s32f_convert_32i_u, 1, 2<<31, 2046, 10000); VOLK_RUN_TESTS(volk_32f_convert_64f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32f_convert_64f_u, 1e-4, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 0, 128, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 0, 128, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_a16, 1, 128, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_s32f_convert_8i_u, 1, 128, 2046, 10000); // VOLK_RUN_TESTS(volk_32fc_s32f_x2_power_spectral_density_32f_a16, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_s32f_power_spectrum_32f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a16, 1e-4, 0, 2046, 10000); @@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 2046, 10000); // VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a16, 1e-4, 2046, 10000); VOLK_RUN_TESTS(volk_32f_index_max_16u_a16, 0, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 0, 32768, 2046, 10000); + VOLK_RUN_TESTS(volk_32f_x2_s32f_interleave_16ic_a16, 1, 32768, 2046, 10000); VOLK_RUN_TESTS(volk_32f_x2_interleave_32fc_a16, 0, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32f_x2_max_32f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32f_x2_min_32f_a16, 1e-4, 0, 2046, 10000); diff --git a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc index 4e33f7b3b..8f6e157e9 100644 --- a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc +++ b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc @@ -1,9 +1,11 @@ .function volk_8i_s32f_convert_32f_a16_orc_impl -.source 2 src +.source 1 src .dest 4 dst .floatparam 4 scalar .temp 4 flsrc .temp 4 lsrc -convswl lsrc, src +.temp 2 ssrc +convsbw ssrc, src +convswl lsrc, ssrc convlf flsrc, lsrc mulf dst, flsrc, scalar -- cgit From 5ebd9ef2580aa36cd3a636c6257bd4b80b2380f8 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 15:44:40 -0800 Subject: Volk: find built headers instead of installed ones --- volk/lib/Makefile.am | 2 +- volk/lib/testqa.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index afd29a352..6f3d7fd86 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -120,7 +120,7 @@ noinst_PROGRAMS = \ testqa testqa_SOURCES = testqa.cc qa_utils.cc -testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN +testqa_CPPFLAGS = -DBOOST_TEST_DYN_LINK -DBOOST_TEST_MAIN $(AM_CPPFLAGS) testqa_LDFLAGS = $(BOOST_UNIT_TEST_FRAMEWORK_LIB) if LV_HAVE_ORC testqa_LDADD = \ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index d6b9e347d..e9734411b 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -1,6 +1,6 @@ #include "qa_utils.h" -#include "../include/volk/volk.h" -#include "../include/volk/volk_registry.h" +#include +#include #include BOOST_AUTO_TEST_CASE(volk_test_all) { -- cgit From e34a484084a5224ec3412bd7d6c6f285301f5d43 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 15:47:56 -0800 Subject: Volk: renamed volk_32fc_32f_power_32fc_a16 to volk_32fc_s32f_power_32fc_a16 --- volk/include/volk/Makefile.am | 2 +- volk/include/volk/volk_32fc_32f_power_32fc_a16.h | 109 ---------------------- volk/include/volk/volk_32fc_s32f_power_32fc_a16.h | 109 ++++++++++++++++++++++ volk/lib/testqa.cc | 2 +- 4 files changed, 111 insertions(+), 111 deletions(-) delete mode 100644 volk/include/volk/volk_32fc_32f_power_32fc_a16.h create mode 100644 volk/include/volk/volk_32fc_s32f_power_32fc_a16.h (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index 1eb46b602..eb97775b0 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -62,7 +62,7 @@ volkinclude_HEADERS = \ volk_32f_accumulator_s32f_a16.h \ volk_32f_x2_add_32f_a16.h \ volk_32fc_32f_multiply_32fc_a16.h \ - volk_32fc_32f_power_32fc_a16.h \ + volk_32fc_s32f_power_32fc_a16.h \ volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_32f_power_32fc_a16.h deleted file mode 100644 index 6f9e9e3ee..000000000 --- a/volk/include/volk/volk_32fc_32f_power_32fc_a16.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef INCLUDED_volk_32fc_32f_power_32fc_a16_H -#define INCLUDED_volk_32fc_32f_power_32fc_a16_H - -#include -#include - -#if LV_HAVE_SSE -#include - -#if LV_HAVE_LIB_SIMDMATH -#include -#endif /* LV_HAVE_LIB_SIMDMATH */ - -/*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector -*/ -static inline void volk_32fc_32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - -#if LV_HAVE_LIB_SIMDMATH - __m128 vPower = _mm_set_ps1(power); - - __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; - for(;number < quarterPoints; number++){ - - cplxValue1 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - cplxValue2 = _mm_load_ps((float*)aPtr); - aPtr += 2; - - // Convert to polar coordinates - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - phase = atan2f4(qValue, iValue); // Calculate the Phase - - magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values - - // Now calculate the power of the polar coordinate data - magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power - - phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power - - // Convert back to cartesian coordinates - iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude - qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude - - cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values - cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values - - _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container - - cPtr += 2; - - _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container - - cPtr += 2; - } - - number = quarterPoints * 4; -#endif /* LV_HAVE_LIB_SIMDMATH */ - - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0; - for(;number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_SSE */ - -#if LV_HAVE_GENERIC - /*! - \brief Takes each the input complex vector value to the specified power and stores the results in the return vector - \param cVector The vector where the results will be stored - \param aVector The complex vector of values to be taken to a power - \param power The power value to be applied to each data point - \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector - */ -static inline void volk_32fc_32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - lv_32fc_t complexPower; - ((float*)&complexPower)[0] = power; - ((float*)&complexPower)[1] = 0.0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_cpow((*aPtr++), complexPower); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_32fc_32f_power_32fc_a16_H */ diff --git a/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h new file mode 100644 index 000000000..3507fdb3c --- /dev/null +++ b/volk/include/volk/volk_32fc_s32f_power_32fc_a16.h @@ -0,0 +1,109 @@ +#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a16_H +#define INCLUDED_volk_32fc_s32f_power_32fc_a16_H + +#include +#include + +#if LV_HAVE_SSE +#include + +#if LV_HAVE_LIB_SIMDMATH +#include +#endif /* LV_HAVE_LIB_SIMDMATH */ + +/*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector +*/ +static inline void volk_32fc_s32f_power_32fc_a16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + +#if LV_HAVE_LIB_SIMDMATH + __m128 vPower = _mm_set_ps1(power); + + __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue; + for(;number < quarterPoints; number++){ + + cplxValue1 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + cplxValue2 = _mm_load_ps((float*)aPtr); + aPtr += 2; + + // Convert to polar coordinates + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + phase = atan2f4(qValue, iValue); // Calculate the Phase + + magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values + + // Now calculate the power of the polar coordinate data + magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power + + phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power + + // Convert back to cartesian coordinates + iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude + qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude + + cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values + cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values + + _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container + + cPtr += 2; + + _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container + + cPtr += 2; + } + + number = quarterPoints * 4; +#endif /* LV_HAVE_LIB_SIMDMATH */ + + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0; + for(;number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_SSE */ + +#if LV_HAVE_GENERIC + /*! + \brief Takes each the input complex vector value to the specified power and stores the results in the return vector + \param cVector The vector where the results will be stored + \param aVector The complex vector of values to be taken to a power + \param power The power value to be applied to each data point + \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector + */ +static inline void volk_32fc_s32f_power_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + lv_32fc_t complexPower; + ((float*)&complexPower)[0] = power; + ((float*)&complexPower)[1] = 0.0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_cpow((*aPtr++), complexPower); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a16_H */ diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index e9734411b..f33670856 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -29,7 +29,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32f_x2_add_32f_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc_a16, 1e-4, 0, 2046, 10000); - VOLK_RUN_TESTS(volk_32fc_32f_power_32fc_a16, 1e-4, 0, 2046, 1000); + VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc_a16, 1e-4, 0, 2046, 1000); VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); -- cgit From a3ee5073e1035154a3331ec7ab4f1842095b7ed4 Mon Sep 17 00:00:00 2001 From: Nick Foster Date: Wed, 26 Jan 2011 17:19:29 -0800 Subject: Volk: fix for 32fc_s32f_magnitude_16i orc impl. --- volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc index 9e2599084..6116f5e1f 100644 --- a/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc +++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc @@ -16,7 +16,7 @@ splitql qf, if, prodiqf addf sumf, if, qf sqrtf rootf, sumf mulf rootf, rootf, scalar -cmpltf maskl, scalar, rootf +cmpltf maskl, 32768.0, rootf andl maskl, maskl, 0x80000000 orl rootf, rootf, maskl convfl rootl, rootf -- cgit From 6503e3b21978b71908400c994148836bec4a97b9 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Sun, 30 Jan 2011 12:35:07 -0500 Subject: volk: Updating build structure to work when orc is not installed. Distcheck passes for me if liborc is installed or not. --- volk/config/orc.m4 | 24 +++++++++++------------- volk/configure.ac | 5 +---- volk/include/volk/make_set_simd.py | 1 - volk/lib/Makefile.am | 5 ++--- 4 files changed, 14 insertions(+), 21 deletions(-) (limited to 'volk') diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index a4653400c..df0f3d6f3 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -7,10 +7,9 @@ AC_DEFUN([ORC_CHECK], [ ORC_REQ=ifelse([$1], , "0.4.10", [$1]) - enable_orc = auto if test "x$enable_orc" != "xno" ; then PKG_CHECK_MODULES(ORC, orc-0.4 >= $ORC_REQ, [ - AC_DEFINE(HAVE_ORC, 1, [Use Orc]) + AC_DEFINE(LV_HAVE_ORC, 1, [Use Orc]) if test "x$ORCC" = "x" ; then ORCC=`$PKG_CONFIG --variable=orcc orc-0.4` fi @@ -21,32 +20,31 @@ AC_DEFUN([ORC_CHECK], AC_SUBST(ORCC_FLAGS) AC_SUBST(ORC_LDFLAGS) AC_SUBST(ORC_CFLAGS) - HAVE_ORC=yes - HAVE_ORCC=yes + LV_HAVE_ORC=yes + LV_HAVE_ORCC=yes if test "x$cross_compiling" = "xyes" ; then - HAVE_ORCC=no + LV_HAVE_ORCC=no fi ], [ if test "x$enable_orc" = "xyes" ; then AC_MSG_ERROR([--enable-orc specified, but Orc >= $ORC_REQ not found]) fi AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) - HAVE_ORC=no - HAVE_ORCC=no + LV_HAVE_ORC=no + LV_HAVE_ORCC=no ]) else AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) - HAVE_ORC=no - HAVE_ORCC=no + LV_HAVE_ORC=no + LV_HAVE_ORCC=no fi - AM_CONDITIONAL(HAVE_ORC, [test "x$HAVE_ORC" = "xyes"]) - AM_CONDITIONAL(HAVE_ORCC, [test "x$HAVE_ORCC" = "xyes"]) - + AM_CONDITIONAL(LV_HAVE_ORC, [test "x$LV_HAVE_ORC" = "xyes"]) + AM_CONDITIONAL(LV_HAVE_ORCC, [test "x$LV_HAVE_ORCC" = "xyes"]) ])) AC_DEFUN([ORC_OUTPUT], [ - if test "$HAVE_ORC" = yes ; then + if test "$LV_HAVE_ORC" = yes ; then printf "configure: *** Orc acceleration enabled.\n" else if test "x$enable_orc" = "xno" ; then diff --git a/volk/configure.ac b/volk/configure.ac index eeb7441ba..c493adad6 100644 --- a/volk/configure.ac +++ b/volk/configure.ac @@ -68,12 +68,9 @@ AC_CONFIG_FILES([\ include/Makefile \ include/volk/Makefile \ lib/Makefile \ + orc/Makefile \ volk.pc \ ]) - -if test "$HAVE_ORC" = yes; then - AC_CONFIG_FILES([orc/Makefile]) -fi AC_OUTPUT diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py index f2b7c0656..019833b43 100644 --- a/volk/include/volk/make_set_simd.py +++ b/volk/include/volk/make_set_simd.py @@ -280,7 +280,6 @@ def make_set_simd(dom) : tempstring = tempstring + " ;;\n" tempstring = tempstring + " esac\n" tempstring = tempstring + " LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n" - tempstring = tempstring + " AM_CONDITIONAL(LV_HAVE_ORC, [test \"$LV_HAVE_ORC\" = \"yes\"])\n"; tempstring = tempstring + "])\n" return tempstring; diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index 6f3d7fd86..af7c7f335 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -45,7 +45,7 @@ AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) \ # list of programs run by "make check" and "make distcheck" -TESTS = testqa +#TESTS = testqa #orc stuff gets built in the ORC directory conditional to ORC being enabled. #it gets linked in during the build of libvolk as an added library. #there might be a better way to do this. @@ -77,7 +77,7 @@ libvolk_la_SOURCES = \ volk_orc_LDFLAGS = \ $(ORC_LDFLAGS) \ -lorc-0.4 - + volk_orc_LIBADD = \ ../orc/libvolk_orc.la @@ -103,7 +103,6 @@ endif #libvolk_qa_la_LIBADD = \ # libvolk.la \ # libvolk_runtime.la - # ---------------------------------------------------------------- # headers that don't get installed -- cgit From 736874202f15222fa3ec10ceeb1815e8a595ed3a Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 2 Feb 2011 13:55:15 -0500 Subject: volk: cleaning up makefile issues after merge. --- volk/lib/Makefile.am | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'volk') diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am index af7c7f335..3e5502369 100644 --- a/volk/lib/Makefile.am +++ b/volk/lib/Makefile.am @@ -57,7 +57,8 @@ lib_LTLIBRARIES = \ EXTRA_DIST = \ volk_mktables.c \ volk_rank_archs.h \ - volk_proccpu_sim.c + volk_proccpu_sim.c \ + gcc_x86_cpuid.h # ---------------------------------------------------------------- # The main library @@ -109,8 +110,7 @@ endif # ---------------------------------------------------------------- noinst_HEADERS = \ volk_init.h \ - qa_utils.h \ - assembly.h + qa_utils.h # ---------------------------------------------------------------- # Our test program -- cgit From b806f6e95cd917e54884841c8e7928204ecd78f8 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Wed, 2 Feb 2011 14:21:46 -0500 Subject: volk: updating to readd unaligned dot product under new name scheme. --- volk/include/volk/Makefile.am | 1 + .../volk/volk_32fc_conjugate_dot_prod_unaligned.h | 144 -------------- .../volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h | 144 ++++++++++++++ volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc | 138 ------------- volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h | 18 -- volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc | 138 +++++++++++++ volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h | 18 ++ volk/lib/qa_volk.cc | 213 --------------------- volk/lib/testqa.cc | 1 + 9 files changed, 302 insertions(+), 513 deletions(-) delete mode 100644 volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h delete mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc delete mode 100644 volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h create mode 100644 volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc create mode 100644 volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h delete mode 100644 volk/lib/qa_volk.cc (limited to 'volk') diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am index eb97775b0..7a5edd624 100644 --- a/volk/include/volk/Makefile.am +++ b/volk/include/volk/Makefile.am @@ -66,6 +66,7 @@ volkinclude_HEADERS = \ volk_32f_s32f_calc_spectral_noise_floor_32f_a16.h \ volk_32fc_s32f_atan2_32f_a16.h \ volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \ + volk_32fc_x2_conjugate_dot_prod_32fc_u.h \ volk_32fc_deinterleave_32f_x2_a16.h \ volk_32fc_deinterleave_64f_x2_a16.h \ volk_32fc_s32f_deinterleave_real_16i_a16.h \ diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h deleted file mode 100644 index ead1573fe..000000000 --- a/volk/include/volk/volk_32fc_conjugate_dot_prod_unaligned.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H -#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H - - -#include - - -#if LV_HAVE_GENERIC - - -static inline void volk_32fc_conjugate_dot_prod_unaligned_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = (num_bytes >> 3) &1; - - - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - int i = 0; - - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - - sum0[0] += in[0] * tp[0] + in[1] * tp[1]; - sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] + in[3] * tp[3]; - sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; - - - in += 4; - tp += 4; - - } - - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - - - for(i = 0; i < isodd; ++i) { - - - *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); - - } - /* - for(i = 0; i < num_bytes >> 3; ++i) { - *result += input[i] * conjf(taps[i]); - } - */ -} - -#endif /*LV_HAVE_GENERIC*/ - -#if LV_HAVE_SSE3 - -#include -#include -#include - - -static inline void volk_32fc_conjugate_dot_prod_unaligned_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { - - static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; - - union HalfMask { - uint32_t intRep[4]; - __m128 vec; - } halfMask; - - union NegMask { - int intRep[4]; - __m128 vec; - } negMask; - - unsigned int offset = 0; - float Rsum=0, Isum=0; - float Im,Re; - - __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; - __m128 zv = {0,0,0,0}; - - halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; - halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; - - negMask.intRep[0] = negMask.intRep[2] = 0x80000000; - negMask.intRep[1] = negMask.intRep[3] = 0; - - // main loop - while(num_bytes >= 4*sizeof(float)){ - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = in1*in2; - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = in1*fehg; - Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - num_bytes -= 4*sizeof(float); - offset += 2; - Rsum += Re; - Isum += Im; - } - - // handle the last complex case ... - if(num_bytes > 0){ - - if(num_bytes != 4){ - // bad things are happening - } - - in1 = _mm_loadu_ps( (float*) (input+offset) ); - in2 = _mm_loadu_ps( (float*) (taps+offset) ); - Rv = _mm_and_ps(in1*in2, halfMask.vec); - fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); - Iv = _mm_and_ps(in1*fehg, halfMask.vec); - Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); - Ivm = _mm_xor_ps( negMask.vec, Iv ); - Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); - _mm_store_ss( &Im, Is ); - _mm_store_ss( &Re, Rs ); - Rsum += Re; - Isum += Im; - } - - result[0] = lv_32fc_init(Rsum,Isum); - return; -} - -#endif /*LV_HAVE_SSE3*/ - - -#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_UNALIGNED_H*/ - - - diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h new file mode 100644 index 000000000..2fa5918cc --- /dev/null +++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -0,0 +1,144 @@ +#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H +#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H + + +#include + + +#if LV_HAVE_GENERIC + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = (num_bytes >> 3) &1; + + + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + int i = 0; + + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + + sum0[0] += in[0] * tp[0] + in[1] * tp[1]; + sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] + in[3] * tp[3]; + sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2]; + + + in += 4; + tp += 4; + + } + + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + + + for(i = 0; i < isodd; ++i) { + + + *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]); + + } + /* + for(i = 0; i < num_bytes >> 3; ++i) { + *result += input[i] * conjf(taps[i]); + } + */ +} + +#endif /*LV_HAVE_GENERIC*/ + +#if LV_HAVE_SSE3 + +#include +#include +#include + + +static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) { + + static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000}; + + union HalfMask { + uint32_t intRep[4]; + __m128 vec; + } halfMask; + + union NegMask { + int intRep[4]; + __m128 vec; + } negMask; + + unsigned int offset = 0; + float Rsum=0, Isum=0; + float Im,Re; + + __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is; + __m128 zv = {0,0,0,0}; + + halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF; + halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000; + + negMask.intRep[0] = negMask.intRep[2] = 0x80000000; + negMask.intRep[1] = negMask.intRep[3] = 0; + + // main loop + while(num_bytes >= 4*sizeof(float)){ + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = in1*in2; + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = in1*fehg; + Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + num_bytes -= 4*sizeof(float); + offset += 2; + Rsum += Re; + Isum += Im; + } + + // handle the last complex case ... + if(num_bytes > 0){ + + if(num_bytes != 4){ + // bad things are happening + } + + in1 = _mm_loadu_ps( (float*) (input+offset) ); + in2 = _mm_loadu_ps( (float*) (taps+offset) ); + Rv = _mm_and_ps(in1*in2, halfMask.vec); + fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1)); + Iv = _mm_and_ps(in1*fehg, halfMask.vec); + Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv); + Ivm = _mm_xor_ps( negMask.vec, Iv ); + Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv); + _mm_store_ss( &Im, Is ); + _mm_store_ss( &Re, Rs ); + Rsum += Re; + Isum += Im; + } + + result[0] = lv_32fc_init(Rsum,Isum); + return; +} + +#endif /*LV_HAVE_SSE3*/ + + +#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/ + + + diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc deleted file mode 100644 index a0680bab6..000000000 --- a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include -#include -#include -#include - - -#define assertcomplexEqual(expected, actual, delta) \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ - CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); - -#define ERR_DELTA (1e-4) - -//test for sse - -#if LV_HAVE_SSE && LV_HAVE_64 - -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform () * 32767; -} - - -void qa_32fc_conjugate_dot_prod_unaligned::t1() { - const int vlen = 789743; - - volk_environment_init(); - int ret; - - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result, 16, 8); - - - result_generic[0] = std::complex(0,0); - result[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - - - volk_32fc_conjugate_dot_prod_unaligned_manual(result_generic, input, taps, vlen * 8, "generic"); - - - volk_32fc_conjugate_dot_prod_unaligned_manual(result, input, taps, vlen * 8, "sse"); - - printf("32fc_conjugate_dot_prod_unaligned\n"); - printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); - - assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result); - -} - - -#elif LV_HAVE_SSE && LV_HAVE_32 - -static float uniform() { - return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) -} - -static void -random_floats (float *buf, unsigned n) -{ - for (unsigned i = 0; i < n; i++) - buf[i] = uniform () * 32767; -} - - -void qa_32fc_conjugate_dot_prod_unaligned::t1() { - const int vlen = 789743; - - volk_environment_init(); - int ret; - - std::complex* input; - std::complex* taps; - - std::complex* result_generic; - std::complex* result; - - ret = posix_memalign((void**)&input, 16, vlen << 3); - ret = posix_memalign((void**)&taps, 16, vlen << 3); - ret = posix_memalign((void**)&result_generic, 16, 8); - ret = posix_memalign((void**)&result, 16, 8); - - - result_generic[0] = std::complex(0,0); - result[0] = std::complex(0,0); - - random_floats((float*)input, vlen * 2); - random_floats((float*)taps, vlen * 2); - - - - volk_32fc_conjugate_dot_prod_unaligned_manual(result_generic, input, taps, vlen * 8, "generic"); - - - volk_32fc_conjugate_dot_prod_unaligned_manual(result, input, taps, vlen * 8, "sse_32"); - - printf("32fc_conjugate_dot_prod_unaligned\n"); - printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); - - assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); - - free(input); - free(taps); - free(result_generic); - free(result); - -} - - -#else - -void qa_32fc_conjugate_dot_prod_unaligned::t1() { - printf("sse not available... no test performed\n"); -} - -#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h b/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h deleted file mode 100644 index 7aead53a1..000000000 --- a/volk/lib/qa_32fc_conjugate_dot_prod_unaligned.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H -#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H - -#include -#include - -class qa_32fc_conjugate_dot_prod_unaligned : public CppUnit::TestCase { - - CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_unaligned); - CPPUNIT_TEST (t1); - CPPUNIT_TEST_SUITE_END (); - - private: - void t1 (); -}; - - -#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_UNALIGNED_H */ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc new file mode 100644 index 000000000..fefdf06ee --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.cc @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include + + +#define assertcomplexEqual(expected, actual, delta) \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \ + CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta); + +#define ERR_DELTA (1e-4) + +//test for sse + +#if LV_HAVE_SSE && LV_HAVE_64 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#elif LV_HAVE_SSE && LV_HAVE_32 + +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform () * 32767; +} + + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + const int vlen = 789743; + + volk_environment_init(); + int ret; + + std::complex* input; + std::complex* taps; + + std::complex* result_generic; + std::complex* result; + + ret = posix_memalign((void**)&input, 16, vlen << 3); + ret = posix_memalign((void**)&taps, 16, vlen << 3); + ret = posix_memalign((void**)&result_generic, 16, 8); + ret = posix_memalign((void**)&result, 16, 8); + + + result_generic[0] = std::complex(0,0); + result[0] = std::complex(0,0); + + random_floats((float*)input, vlen * 2); + random_floats((float*)taps, vlen * 2); + + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result_generic, input, taps, vlen * 8, "generic"); + + + volk_32fc_x2_conjugate_dot_prod_32fc_u_manual(result, input, taps, vlen * 8, "sse_32"); + + printf("32fc_x2_conjugate_dot_prod_32fc_u\n"); + printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0])); + + assertcomplexEqual(result_generic[0], result[0], ERR_DELTA); + + free(input); + free(taps); + free(result_generic); + free(result); + +} + + +#else + +void qa_32fc_x2_conjugate_dot_prod_32fc_u::t1() { + printf("sse not available... no test performed\n"); +} + +#endif /*LV_HAVE_SSE*/ diff --git a/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h new file mode 100644 index 000000000..f07402403 --- /dev/null +++ b/volk/lib/qa_32fc_x2_conjugate_dot_prod_32fc_u.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H +#define INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H + +#include +#include + +class qa_32fc_x2_conjugate_dot_prod_32fc_u : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_x2_conjugate_dot_prod_32fc_u); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_X2_CONJUGATE_DOT_PROD_32FC_U_H */ diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc deleted file mode 100644 index 98d3e9728..000000000 --- a/volk/lib/qa_volk.cc +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright 2008 Free Software Foundation, Inc. - * - * This file is part of GNU Radio - * - * GNU Radio is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3, or (at your option) - * any later version. - * - * GNU Radio is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Radio; see the file COPYING. If not, write to - * the Free Software Foundation, Inc., 51 Franklin Street, - * Boston, MA 02110-1301, USA. - */ - -/* - * This class gathers together all the test cases for the example - * directory into a single test suite. As you create new test cases, - * add them here. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -CppUnit::TestSuite * -qa_volk::suite() -{ - CppUnit::TestSuite *s = new CppUnit::TestSuite("volk"); - - s->addTest(qa_16s_quad_max_star_aligned16::suite()); - s->addTest(qa_32fc_dot_prod_aligned16::suite()); - s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite()); - s->addTest(qa_32fc_square_dist_aligned16::suite()); - s->addTest(qa_32f_sum_of_poly_aligned16::suite()); - s->addTest(qa_32fc_index_max_aligned16::suite()); - s->addTest(qa_32f_index_max_aligned16::suite()); - s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite()); - s->addTest(qa_32fc_conjugate_dot_prod_unaligned::suite()); - s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite()); - s->addTest(qa_16s_branch_4_state_8_aligned16::suite()); - s->addTest(qa_16s_max_star_horizontal_aligned16::suite()); - s->addTest(qa_16s_max_star_aligned16::suite()); - s->addTest(qa_16s_add_quad_aligned16::suite()); - s->addTest(qa_32f_add_aligned16::suite()); - s->addTest(qa_32f_subtract_aligned16::suite()); - s->addTest(qa_32f_max_aligned16::suite()); - s->addTest(qa_32f_min_aligned16::suite()); - s->addTest(qa_64f_max_aligned16::suite()); - s->addTest(qa_64f_min_aligned16::suite()); - s->addTest(qa_32s_and_aligned16::suite()); - s->addTest(qa_32s_or_aligned16::suite()); - s->addTest(qa_32f_dot_prod_aligned16::suite()); - s->addTest(qa_32f_dot_prod_unaligned16::suite()); - s->addTest(qa_32f_fm_detect_aligned16::suite()); - s->addTest(qa_32fc_32f_multiply_aligned16::suite()); - s->addTest(qa_32fc_multiply_aligned16::suite()); - s->addTest(qa_32f_divide_aligned16::suite()); - s->addTest(qa_32f_multiply_aligned16::suite()); - s->addTest(qa_32f_sqrt_aligned16::suite()); - s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite()); - s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite()); - s->addTest(qa_32u_popcnt_aligned16::suite()); - s->addTest(qa_64u_popcnt_aligned16::suite()); - s->addTest(qa_16u_byteswap_aligned16::suite()); - s->addTest(qa_32u_byteswap_aligned16::suite()); - s->addTest(qa_64u_byteswap_aligned16::suite()); - s->addTest(qa_32f_normalize_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_16s_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite()); - s->addTest(qa_16sc_magnitude_16s_aligned16::suite()); - s->addTest(qa_16sc_magnitude_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_64f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite()); - s->addTest(qa_32fc_magnitude_16s_aligned16::suite()); - s->addTest(qa_32fc_magnitude_32f_aligned16::suite()); - s->addTest(qa_32f_interleave_16sc_aligned16::suite()); - s->addTest(qa_32f_interleave_32fc_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_16s_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_32f_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite()); - s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite()); - s->addTest(qa_16s_convert_32f_aligned16::suite()); - s->addTest(qa_16s_convert_32f_unaligned16::suite()); - s->addTest(qa_16s_convert_8s_aligned16::suite()); - s->addTest(qa_16s_convert_8s_unaligned16::suite()); - s->addTest(qa_32f_convert_16s_aligned16::suite()); - s->addTest(qa_32f_convert_16s_unaligned16::suite()); - s->addTest(qa_32f_convert_32s_aligned16::suite()); - s->addTest(qa_32f_convert_32s_unaligned16::suite()); - s->addTest(qa_32f_convert_64f_aligned16::suite()); - s->addTest(qa_32f_convert_64f_unaligned16::suite()); - s->addTest(qa_32f_convert_8s_aligned16::suite()); - s->addTest(qa_32f_convert_8s_unaligned16::suite()); - s->addTest(qa_32s_convert_32f_aligned16::suite()); - s->addTest(qa_32s_convert_32f_unaligned16::suite()); - s->addTest(qa_64f_convert_32f_aligned16::suite()); - s->addTest(qa_64f_convert_32f_unaligned16::suite()); - s->addTest(qa_8s_convert_16s_aligned16::suite()); - s->addTest(qa_8s_convert_16s_unaligned16::suite()); - s->addTest(qa_8s_convert_32f_aligned16::suite()); - s->addTest(qa_8s_convert_32f_unaligned16::suite()); - s->addTest(qa_32fc_32f_power_32fc_aligned16::suite()); - s->addTest(qa_32f_power_aligned16::suite()); - s->addTest(qa_32fc_atan2_32f_aligned16::suite()); - s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite()); - s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite()); - s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite()); - s->addTest(qa_32f_accumulator_aligned16::suite()); - s->addTest(qa_32f_stddev_aligned16::suite()); - s->addTest(qa_32f_stddev_and_mean_aligned16::suite()); - - return s; -} diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc index f33670856..779bc61eb 100644 --- a/volk/lib/testqa.cc +++ b/volk/lib/testqa.cc @@ -33,6 +33,7 @@ BOOST_AUTO_TEST_CASE(volk_test_all) { VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f_a16, 1e-4, 20.0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f_a16, 1e-4, 10.0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_a16, 1e-4, 0, 2046, 10000); + VOLK_RUN_TESTS(volk_32fc_x2_conjugate_dot_prod_32fc_u, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_deinterleave_32f_x2_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_deinterleave_64f_x2_a16, 1e-4, 0, 2046, 10000); VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a16, 0, 32768, 2046, 10000); -- cgit From ca64a7a905b1a5fa5e74dd730591f0f7e0ff3929 Mon Sep 17 00:00:00 2001 From: Tom Rondeau Date: Thu, 17 Feb 2011 15:48:41 -0500 Subject: volk: Properly sets up package config script if Orc is not found for Volk. --- volk/config/orc.m4 | 4 ++++ volk/volk.pc.in | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'volk') diff --git a/volk/config/orc.m4 b/volk/config/orc.m4 index df0f3d6f3..d17160a9a 100644 --- a/volk/config/orc.m4 +++ b/volk/config/orc.m4 @@ -22,6 +22,7 @@ AC_DEFUN([ORC_CHECK], AC_SUBST(ORC_CFLAGS) LV_HAVE_ORC=yes LV_HAVE_ORCC=yes + LV_ORC_PKGCONFIG="-lvolk_orc" if test "x$cross_compiling" = "xyes" ; then LV_HAVE_ORCC=no fi @@ -32,14 +33,17 @@ AC_DEFUN([ORC_CHECK], AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) LV_HAVE_ORC=no LV_HAVE_ORCC=no + LV_ORC_PKGCONFIG="" ]) else AC_DEFINE(DISABLE_ORC, 1, [Disable Orc]) LV_HAVE_ORC=no LV_HAVE_ORCC=no + LV_ORC_PKGCONFIG="" fi AM_CONDITIONAL(LV_HAVE_ORC, [test "x$LV_HAVE_ORC" = "xyes"]) AM_CONDITIONAL(LV_HAVE_ORCC, [test "x$LV_HAVE_ORCC" = "xyes"]) + AC_SUBST(LV_ORC_PKGCONFIG) ])) AC_DEFUN([ORC_OUTPUT], diff --git a/volk/volk.pc.in b/volk/volk.pc.in index b03dbdada..85425ba64 100644 --- a/volk/volk.pc.in +++ b/volk/volk.pc.in @@ -10,6 +10,6 @@ Name: volk Description: VOLK.. Vector Optimized Library of Kernels Requires: Version: @VERSION@ -Libs: -lvolk -lvolk_runtime -lvolk_orc +Libs: -lvolk -lvolk_runtime @LV_ORC_PKGCONFIG@ Cflags: -I${includedir} ${LV_CXXFLAGS} -- cgit From 942e26851291e023dabf372b7dc38bd089dbd237 Mon Sep 17 00:00:00 2001 From: Don Ward Date: Sat, 5 Mar 2011 20:22:36 -0500 Subject: volk: fixing configuration for Cygwin builds. --- volk/config/gcc_version_workaround.m4 | 6 +++++- volk/config/lv_configure.m4 | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 volk/config/gcc_version_workaround.m4 mode change 100644 => 100755 volk/config/lv_configure.m4 (limited to 'volk') diff --git a/volk/config/gcc_version_workaround.m4 b/volk/config/gcc_version_workaround.m4 old mode 100644 new mode 100755 index b3ba0b6f3..3cd8a0cc7 --- a/volk/config/gcc_version_workaround.m4 +++ b/volk/config/gcc_version_workaround.m4 @@ -1,5 +1,9 @@ AC_DEFUN([LV_GCC_VERSION_WORKAROUND], [ + case "${host_os}" in + *cygwin*) + ;; + *) AC_REQUIRE([LF_CONFIGURE_CXX]) cxx_version=`$CXX --version` @@ -44,6 +48,6 @@ AC_DEFUN([LV_GCC_VERSION_WORKAROUND], fi - + esac ]) \ No newline at end of file diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4 old mode 100644 new mode 100755 index f98b2dc5b..dfa490cdf --- a/volk/config/lv_configure.m4 +++ b/volk/config/lv_configure.m4 @@ -98,12 +98,13 @@ dnl AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes]) dnl Check for Mingw support GR_PWIN32 - GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS + dnl GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS dnl Check for liborc ORC_CHECK - LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" + dnl Following causes test for -lboost_unit_test_framework to fail on Cygwin + dnl LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS" AC_CHECK_PROG([XMLTO],[xmlto],[yes],[]) AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes]) -- cgit From b013372e7e02461bf5e67845b333030eee164bea Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Tue, 8 Mar 2011 16:33:17 -0800 Subject: volk: replace posix_memalign with something cross platform --- volk/lib/qa_utils.cc | 57 ++++++++++++++++++++++++---------------------------- volk/lib/qa_utils.h | 2 +- 2 files changed, 27 insertions(+), 32 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index e85e2c1bc..710d56fb8 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -1,19 +1,20 @@ #include "qa_utils.h" -#include +#include #include #include #include //#include #include #include -#include -#include +#include +#include #include //#include #include #include #include #include +#include float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) @@ -61,22 +62,6 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) { } } -void *make_aligned_buffer(unsigned int len, unsigned int size) { - void *buf; - int ret; - ret = posix_memalign((void**)&buf, 16, len * size); - assert(ret == 0); - memset(buf, 0x00, len*size); - return buf; -} - -void make_buffer_for_signature(std::vector &buffs, std::vector inputsig, unsigned int vlen) { - BOOST_FOREACH(volk_type_t sig, inputsig) { - if(!sig.is_scalar) //we don't make buffers for scalars - buffs.push_back(make_aligned_buffer(vlen, sig.size*(sig.is_complex ? 2 : 1))); - } -} - static std::vector get_arch_list(const int archs[]) { std::vector archlist; int num_archs = archs[0]; @@ -282,6 +267,18 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { return fail; } +class volk_qa_aligned_mem_pool{ +public: + void *get_new(size_t size, size_t alignment = 16){ + boost::shared_array mem(new char[size + alignment-1]); + size_t ptr = size_t(mem.get() + alignment-1) & ~(alignment-1); + std::memset((void *)ptr, 0x00, size); + _mems.push_back(mem); + return (void *)ptr; + } +private: std::vector > _mems; +}; + bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) { std::cout << "RUN_VOLK_TESTS: " << name << std::endl; @@ -292,7 +289,10 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, std::cout << "no architectures to test" << std::endl; return false; } - + + //something that can hang onto memory and cleanup when this function exits + volk_qa_aligned_mem_pool mem_pool; + //now we have to get a function signature by parsing the name std::vector inputsig, outputsig; get_signatures_from_name(inputsig, outputsig, name); @@ -309,12 +309,12 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, //for(int i=0; i inbuffs; - std::vector free_buffs; //this is just a list of void*'s that i'll have to free later. - //we need it because we dupe void*s in test_data below. - make_buffer_for_signature(inbuffs, inputsig, vlen); + BOOST_FOREACH(volk_type_t sig, inputsig) { + if(!sig.is_scalar) //we don't make buffers for scalars + inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); + } for(int i=0; i arch_buffs; for(int j=0; j +#include #include struct volk_type_t { -- cgit From 6673be777cd5395ae867e67db8c95aa09066617a Mon Sep 17 00:00:00 2001 From: Johnathan Corgan Date: Sat, 12 Mar 2011 15:47:40 -0800 Subject: Added/updated ignore files. --- volk/lib/.gitignore | 1 + volk/orc/.gitignore | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 volk/orc/.gitignore (limited to 'volk') diff --git a/volk/lib/.gitignore b/volk/lib/.gitignore index 0f17543ab..6a5fde28f 100644 --- a/volk/lib/.gitignore +++ b/volk/lib/.gitignore @@ -20,3 +20,4 @@ /volk_proccpu_sim.c /volk_runtime.c /test_all +/testqa diff --git a/volk/orc/.gitignore b/volk/orc/.gitignore new file mode 100644 index 000000000..b336cc7ce --- /dev/null +++ b/volk/orc/.gitignore @@ -0,0 +1,2 @@ +/Makefile +/Makefile.in -- cgit From 888beebf6015d9a88dbd1c3c842cf2490899a99b Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 14 Mar 2011 09:33:00 -0700 Subject: volk: simplify the get new method for the aligned pool --- volk/lib/qa_utils.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'volk') diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc index 710d56fb8..b0f63d2b5 100644 --- a/volk/lib/qa_utils.cc +++ b/volk/lib/qa_utils.cc @@ -6,6 +6,7 @@ //#include #include #include +#include #include #include #include @@ -14,7 +15,6 @@ #include #include #include -#include float uniform() { return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) @@ -270,13 +270,11 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { class volk_qa_aligned_mem_pool{ public: void *get_new(size_t size, size_t alignment = 16){ - boost::shared_array mem(new char[size + alignment-1]); - size_t ptr = size_t(mem.get() + alignment-1) & ~(alignment-1); - std::memset((void *)ptr, 0x00, size); - _mems.push_back(mem); - return (void *)ptr; + _mems.push_back(std::vector(size + alignment-1, 0)); + size_t ptr = size_t(&_mems.back().front()); + return (void *)((ptr + alignment-1) & ~(alignment-1)); } -private: std::vector > _mems; +private: std::list > _mems; }; bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, float scalar, int vlen, int iter) { -- cgit